Thanks for looking at this for me. In simplifying the test case for a bug report, I've narrowed the "problem" to integer overflow considerations. My len variable is declared int, and the target has 64-bit pointers. I'm gathering that the "manual transformation" I quoted below is not considered "equivalent" to the original source code due to different integer overflow behaviors. If I redeclare len to be unsigned long long, then I automatically get the optimizations that I was originally expecting.
I suppose this is really NOT a bug? Is there a compiler optimization flag that allows the optimizer to ignore array index integer overflow in considering legal optimizations? On 7/13/18 9:14 PM, Bin.Cheng wrote: > On Fri, Jul 13, 2018 at 6:04 AM, Kelvin Nilsen <kdnil...@linux.ibm.com> wrote: >> A somewhat old "issue report" pointed me to the code generated for a 4-fold >> manually unrolled version of the following loop: >> >>> while (++len != len_limit) /* this is loop */ >>> if (pb[len] != cur[len]) >>> break; >> >> As unrolled, the loop appears as: >> >>> while (++len != len_limit) /* this is loop */ { >>> if (pb[len] != cur[len]) >>> break; >>> if (++len == len_limit) /* unrolled 2nd iteration */ >>> break; >>> if (pb[len] != cur[len]) >>> break; >>> if (++len == len_limit) /* unrolled 3rd iteration */ >>> break; >>> if (pb[len] != cur[len]) >>> break; >>> if (++len == len_limit) /* unrolled 4th iteration */ >>> break; >>> if (pb[len] != cur[len]) >>> break; >>> } >> >> In examining the behavior of tree-ssa-loop-ivopts.c, I've discovered the >> only induction variable candidates that are being considered are all forms >> of the len variable. We are not considering any induction variables to >> represent the address expressions &pb[len] and &cur[len]. >> >> I rewrote the source code for this loop to make the addressing expressions >> more explicit, as in the following: >> >>> cur++; >>> while (++pb != last_pb) /* this is loop */ { >>> if (*pb != *cur) >>> break; >>> ++cur; >>> if (++pb == last_pb) /* unrolled 2nd iteration */ >>> break; >>> if (*pb != *cur) >>> break; >>> ++cur; >>> if (++pb == last_pb) /* unrolled 3rd iteration */ >>> break; >>> if (*pb != *cur) >>> break; >>> ++cur; >>> if (++pb == last_pb) /* unrolled 4th iteration */ >>> break; >>> if (*pb != *cur) >>> break; >>> ++cur; >>> } >> >> Now, gcc does a better job of identifying the "address expression induction >> variables". This version of the loop runs about 10% faster than the >> original on my target architecture. >> >> This would seem to be a textbook pattern for the induction variable >> analysis. Does anyone have any thoughts on the best way to add these >> candidates to the set of induction variables that are considered by >> tree-ssa-loop-ivopts.c? >> >> Thanks in advance for any suggestions. >> > Hi, > Could you please file a bug with your original slow test code > attached? I tried to construct meaningful test case from your code > snippet but not successful. There is difference in generated > assembly, but it's not that fundamental. So a bug with preprocessed > test would be high appreciated. > I think there are two potential issues in cost computation for such > case: invariant expression and iv uses outside of loop handled as > inside uses. > > Thanks, > bin > >
#include <stdlib.h> #include <string.h> int bt_skip_func(const __uint64_t len_limit, const __uint8_t *cur, long long int delta, __uint64_t len) { const __uint8_t *pb = cur - delta; while (++len != len_limit) { if (pb[len] != cur[len]) break; if (++len == len_limit) break; if (pb[len] != cur[len]) break; if (++len == len_limit) break; if (pb[len] != cur[len]) break; if (++len == len_limit) break; if (pb[len] != cur[len]) break; } return len; } int main (int argc, char *argv[]) { char *text_input = "ttttttttttttttttthis is some text that should be longer"; unsigned long long int len_limit = strlen (text_input); int pos = 0; int cur_match = 0; int depth = 0; int result = bt_skip_func(len_limit, text_input + 3, (long long) 3, (unsigned long long) 1); }
.file "ivsimple.long.c" .abiversion 2 .section ".text" .align 2 .p2align 4,,15 .globl bt_skip_func .type bt_skip_func, @function bt_skip_func: .LFB22: .cfi_startproc subf 5,5,4 # 13 [c=4 l=4] *subfdi3 std 28,-32(1) # 107 [c=4 l=4] *movdi_internal64/0 std 29,-24(1) # 108 [c=4 l=4] *movdi_internal64/0 addi 8,6,1 # 14 [c=4 l=4] *adddi3/1 std 30,-16(1) # 109 [c=4 l=4] *movdi_internal64/0 std 31,-8(1) # 110 [c=4 l=4] *movdi_internal64/0 .cfi_offset 28, -32 .cfi_offset 29, -24 .cfi_offset 30, -16 .cfi_offset 31, -8 addi 12,5,1 # 18 [c=4 l=4] *adddi3/1 addi 30,5,2 # 29 [c=4 l=4] *adddi3/1 addi 28,5,3 # 40 [c=4 l=4] *adddi3/1 addi 11,4,1 # 19 [c=4 l=4] *adddi3/1 addi 31,4,2 # 30 [c=4 l=4] *adddi3/1 addi 29,4,3 # 41 [c=4 l=4] *adddi3/1 b .L2 # 153 [c=4 l=4] jump .p2align 4,,15 .L4: lbzx 10,12,6 # 20 [c=8 l=4] zero_extendqisi2/0 lbzx 7,11,6 # 21 [c=8 l=4] zero_extendqisi2/0 cmpw 7,10,7 # 22 [c=4 l=4] *cmpsi_signed bne 7,.L5 # 23 [c=4 l=4] *rs6000.md:12311 beq 5,.L3 # 27 [c=4 l=4] *rs6000.md:12311 lbzx 10,30,6 # 31 [c=8 l=4] zero_extendqisi2/0 lbzx 7,31,6 # 32 [c=8 l=4] zero_extendqisi2/0 addi 8,8,4 # 51 [c=4 l=4] *adddi3/1 cmpw 7,10,7 # 33 [c=4 l=4] *cmpsi_signed bne 7,.L3 # 34 [c=4 l=4] *rs6000.md:12311 addi 9,6,3 # 36 [c=4 l=4] *adddi3/1 cmpld 7,3,9 # 37 [c=4 l=4] *cmpdi_unsigned beq 7,.L3 # 38 [c=4 l=4] *rs6000.md:12311 lbzx 10,28,6 # 42 [c=8 l=4] zero_extendqisi2/0 lbzx 7,29,6 # 43 [c=8 l=4] zero_extendqisi2/0 addi 6,6,4 # 47 [c=4 l=4] *adddi3/1 cmpld 5,3,6 # 48 [c=4 l=4] *cmpdi_unsigned cmpw 7,10,7 # 44 [c=4 l=4] *cmpsi_signed bne 7,.L3 # 45 [c=4 l=4] *rs6000.md:12311 beq 5,.L6 # 49 [c=4 l=4] *rs6000.md:12311 lbzx 9,5,6 # 52 [c=8 l=4] zero_extendqisi2/0 lbzx 10,4,6 # 53 [c=8 l=4] zero_extendqisi2/0 cmpw 7,9,10 # 54 [c=4 l=4] *cmpsi_signed bne 7,.L7 # 55 [c=4 l=4] *rs6000.md:12311 .L2: cmpld 7,3,8 # 59 [c=4 l=4] *cmpdi_unsigned addi 9,6,2 # 25 [c=4 l=4] *adddi3/1 cmpld 5,3,9 # 26 [c=4 l=4] *cmpdi_unsigned bne 7,.L4 # 60 [c=4 l=4] *rs6000.md:12311 .L6: mr 9,3 # 7 [c=4 l=4] *movdi_internal64/2 .L3: ld 28,-32(1) # 113 [c=8 l=4] *movdi_internal64/1 ld 29,-24(1) # 114 [c=8 l=4] *movdi_internal64/1 ld 30,-16(1) # 115 [c=8 l=4] *movdi_internal64/1 ld 31,-8(1) # 116 [c=8 l=4] *movdi_internal64/1 extsw 3,9 # 69 [c=4 l=4] extendsidi2/1 .cfi_remember_state .cfi_restore 31 .cfi_restore 30 .cfi_restore 29 .cfi_restore 28 blr # 118 [c=4 l=4] simple_return .p2align 4,,15 .L5: .cfi_restore_state ld 28,-32(1) # 131 [c=8 l=4] *movdi_internal64/1 ld 29,-24(1) # 132 [c=8 l=4] *movdi_internal64/1 ld 30,-16(1) # 133 [c=8 l=4] *movdi_internal64/1 ld 31,-8(1) # 134 [c=8 l=4] *movdi_internal64/1 mr 9,8 # 8 [c=4 l=4] *movdi_internal64/2 extsw 3,9 # 128 [c=4 l=4] extendsidi2/1 .cfi_remember_state .cfi_restore 31 .cfi_restore 30 .cfi_restore 29 .cfi_restore 28 blr # 136 [c=4 l=4] simple_return .p2align 4,,15 .L7: .cfi_restore_state ld 28,-32(1) # 144 [c=8 l=4] *movdi_internal64/1 ld 29,-24(1) # 145 [c=8 l=4] *movdi_internal64/1 ld 30,-16(1) # 146 [c=8 l=4] *movdi_internal64/1 ld 31,-8(1) # 147 [c=8 l=4] *movdi_internal64/1 mr 9,6 # 9 [c=4 l=4] *movdi_internal64/2 extsw 3,9 # 141 [c=4 l=4] extendsidi2/1 .cfi_restore 31 .cfi_restore 30 .cfi_restore 29 .cfi_restore 28 blr # 149 [c=4 l=4] simple_return .long 0 .byte 0,0,0,0,0,4,0,0 .cfi_endproc .LFE22: .size bt_skip_func,.-bt_skip_func .section .text.startup,"ax",@progbits .align 2 .p2align 4,,15 .globl main .type main, @function main: .LFB23: .cfi_startproc li 3,0 # 11 [c=4 l=4] *movdi_internal64/3 blr # 18 [c=4 l=4] simple_return .long 0 .byte 0,0,0,0,0,0,0,0 .cfi_endproc .LFE23: .size main,.-main .ident "GCC: (GNU) 9.0.0 20180716 (experimental)" .section .note.GNU-stack,"",@progbits
.file "ivsimple.transformed.c" .abiversion 2 .section ".text" .align 2 .p2align 4,,15 .globl bt_skip_func .type bt_skip_func, @function bt_skip_func: .LFB22: .cfi_startproc subf 5,5,4 # 13 [c=4 l=4] *subfdi3 addi 9,6,1 # 18 [c=4 l=4] *adddi3/1 add 3,5,3 # 14 [c=4 l=4] *adddi3/0 add 9,5,9 # 19 [c=4 l=4] *adddi3/0 cmpld 7,3,9 # 20 [c=4 l=4] *cmpdi_unsigned addi 6,6,1 # 15 [c=4 l=4] *addsi3/1 rldicl 6,6,0,32 # 16 [c=4 l=4] zero_extendsidi2/1 add 4,4,6 # 17 [c=4 l=4] *adddi3/0 bne 7,.L3 # 21 [c=4 l=4] *rs6000.md:12311 b .L6 # 124 [c=4 l=4] jump .p2align 4,,15 .L9: beq 5,.L2 # 31 [c=4 l=4] *rs6000.md:12311 lbz 8,1(9) # 33 [c=8 l=4] zero_extendqisi2/0 lbz 7,1(4) # 34 [c=8 l=4] zero_extendqisi2/0 cmpw 7,8,7 # 35 [c=4 l=4] *cmpsi_signed bne 7,.L2 # 36 [c=4 l=4] *rs6000.md:12311 addi 10,9,2 # 38 [c=4 l=4] *adddi3/1 cmpld 7,3,10 # 39 [c=4 l=4] *cmpdi_unsigned beq 7,.L2 # 40 [c=4 l=4] *rs6000.md:12311 lbz 8,2(9) # 42 [c=8 l=4] zero_extendqisi2/0 lbz 7,2(4) # 43 [c=8 l=4] zero_extendqisi2/0 cmpw 7,8,7 # 44 [c=4 l=4] *cmpsi_signed bne 7,.L2 # 45 [c=4 l=4] *rs6000.md:12311 addi 10,9,3 # 47 [c=4 l=4] *adddi3/1 cmpld 7,10,3 # 48 [c=4 l=4] *cmpdi_unsigned beq 7,.L6 # 49 [c=4 l=4] *rs6000.md:12311 lbz 8,3(9) # 51 [c=8 l=4] zero_extendqisi2/0 lbz 7,3(4) # 52 [c=8 l=4] zero_extendqisi2/0 addi 9,9,4 # 57 [c=4 l=4] *adddi3/1 addi 4,4,4 # 56 [c=4 l=4] *adddi3/1 cmpld 5,3,9 # 59 [c=4 l=4] *cmpdi_unsigned cmpw 7,8,7 # 53 [c=4 l=4] *cmpsi_signed bne 7,.L2 # 54 [c=4 l=4] *rs6000.md:12311 beq 5,.L6 # 60 [c=4 l=4] *rs6000.md:12311 .L3: lbz 8,0(9) # 23 [c=8 l=4] zero_extendqisi2/0 lbz 7,0(4) # 24 [c=8 l=4] zero_extendqisi2/0 addi 10,9,1 # 29 [c=4 l=4] *adddi3/1 cmpld 5,3,10 # 30 [c=4 l=4] *cmpdi_unsigned cmpw 7,8,7 # 25 [c=4 l=4] *cmpsi_signed beq 7,.L9 # 26 [c=4 l=4] *rs6000.md:12311 mr 10,9 # 7 [c=4 l=4] *movdi_internal64/2 .L2: subf 3,5,10 # 63 [c=4 l=4] *subfdi3 extsw 3,3 # 70 [c=4 l=4] extendsidi2/1 blr # 104 [c=4 l=4] simple_return .p2align 4,,15 .L6: mr 10,3 # 9 [c=4 l=4] *movdi_internal64/2 subf 3,5,10 # 115 [c=4 l=4] *subfdi3 extsw 3,3 # 116 [c=4 l=4] extendsidi2/1 blr # 119 [c=4 l=4] simple_return .long 0 .byte 0,0,0,0,0,0,0,0 .cfi_endproc .LFE22: .size bt_skip_func,.-bt_skip_func .section ".toc","aw" .align 3 .LC1: .quad .LC0+3 .section .text.startup,"ax",@progbits .align 2 .p2align 4,,15 .globl main .type main, @function main: .LFB23: .cfi_startproc .LCF1: 0: addis 2,12,.TOC.-.LCF1@ha addi 2,2,.TOC.-.LCF1@l .localentry main,.-main mflr 0 # 23 [c=4 l=4] *movdi_internal64/21 addis 4,2,.LC1@toc@ha # 35 [c=12 l=8] fusion_gpr_load_di ld 4,.LC1@toc@l(4) li 6,1 # 8 [c=4 l=4] *movdi_internal64/3 li 5,3 # 9 [c=4 l=4] *movdi_internal64/3 li 3,55 # 11 [c=4 l=4] *movdi_internal64/3 std 0,16(1) # 24 [c=4 l=4] *movdi_internal64/0 stdu 1,-32(1) # 25 [c=4 l=4] movdi_di_update/1 .cfi_def_cfa_offset 32 .cfi_offset 65, 16 bl bt_skip_func # 12 [c=4 l=4] *call_value_local_aixdi addi 1,1,32 # 28 [c=4 l=4] *adddi3/1 .cfi_def_cfa_offset 0 li 3,0 # 17 [c=4 l=4] *movdi_internal64/3 ld 0,16(1) # 29 [c=8 l=4] *movdi_internal64/1 mtlr 0 # 30 [c=4 l=4] *movdi_internal64/22 .cfi_restore 65 blr # 31 [c=4 l=4] simple_return .long 0 .byte 0,0,0,1,128,0,0,0 .cfi_endproc .LFE23: .size main,.-main .section .rodata.str1.8,"aMS",@progbits,1 .align 3 .LC0: .string "ttttttttttttttttthis is some text that should be longer" .ident "GCC: (GNU) 9.0.0 20180716 (experimental)" .section .note.GNU-stack,"",@progbits
#include <stdlib.h> #include <string.h> int bt_skip_func(const __uint32_t len_limit, const __uint8_t *cur, int delta, __uint32_t len) { const __uint8_t *pb = cur - delta; const __uint8_t *orig_pb = pb; const __uint8_t *last_pb = pb + len_limit; pb += len; cur += len + 1; while (++pb != last_pb) { if (*pb != *cur) break; cur++; /* 2nd iteration */ if (++pb == last_pb) break; if (*pb != *cur) break; cur++; /* 3rd iteration */ if (++pb == last_pb) break; if (*pb != *cur) break; cur++; /* 4th iteration */ if (++pb == last_pb) break; if (*pb != *cur) break; cur++; /* 4th iteration */ } return pb - orig_pb; } int main (int argc, char *argv[]) { char *text_input = "ttttttttttttttttthis is some text that should be longer"; int len_limit = strlen (text_input); int pos = 0; int cur_match = 0; int depth = 0; int result = bt_skip_func(len_limit, text_input + 3, 3, 1); }
.file "ivsimple.c" .abiversion 2 .section ".text" .align 2 .p2align 4,,15 .globl bt_skip_func .type bt_skip_func, @function bt_skip_func: .LFB22: .cfi_startproc addi 10,6,1 # 14 [c=4 l=4] *addsi3/1 subf 5,5,4 # 13 [c=4 l=4] *subfdi3 rldicl 10,10,0,32 # 15 [c=4 l=4] zero_extendsidi2/1 b .L2 # 128 [c=4 l=4] jump .p2align 4,,15 .L4: lbzx 8,5,10 # 20 [c=8 l=4] zero_extendqisi2/0 lbzx 11,4,10 # 21 [c=8 l=4] zero_extendqisi2/0 cmpw 7,8,11 # 22 [c=4 l=4] *cmpsi_signed bne 7,.L5 # 23 [c=4 l=4] *rs6000.md:12311 beq 5,.L3 # 28 [c=4 l=4] *rs6000.md:12311 lbzx 8,5,9 # 31 [c=8 l=4] zero_extendqisi2/0 lbzx 11,4,9 # 32 [c=8 l=4] zero_extendqisi2/0 rldicl 10,0,0,32 # 54 [c=4 l=4] zero_extendsidi2/1 cmpw 7,8,11 # 33 [c=4 l=4] *cmpsi_signed bne 7,.L3 # 34 [c=4 l=4] *rs6000.md:12311 rldicl 9,7,0,32 # 37 [c=4 l=4] zero_extendsidi2/1 beq 6,.L3 # 39 [c=4 l=4] *rs6000.md:12311 lbzx 8,5,9 # 42 [c=8 l=4] zero_extendqisi2/0 lbzx 7,4,9 # 43 [c=8 l=4] zero_extendqisi2/0 cmpw 7,8,7 # 44 [c=4 l=4] *cmpsi_signed bne 7,.L3 # 45 [c=4 l=4] *rs6000.md:12311 beq 1,.L7 # 50 [c=4 l=4] *rs6000.md:12311 lbzx 9,5,6 # 55 [c=8 l=4] zero_extendqisi2/0 lbzx 8,4,6 # 56 [c=8 l=4] zero_extendqisi2/0 cmpw 7,9,8 # 57 [c=4 l=4] *cmpsi_signed bne 7,.L7 # 58 [c=4 l=4] *rs6000.md:12311 .L2: cmplw 7,3,10 # 62 [c=4 l=4] *cmpsi_unsigned addi 9,6,2 # 25 [c=4 l=4] *addsi3/1 addi 7,6,3 # 36 [c=4 l=4] *addsi3/1 addi 6,6,4 # 47 [c=4 l=4] *addsi3/1 cmplw 5,3,9 # 27 [c=4 l=4] *cmpsi_unsigned cmplw 1,3,6 # 49 [c=4 l=4] *cmpsi_unsigned cmplw 6,3,7 # 38 [c=4 l=4] *cmpsi_unsigned addi 0,10,4 # 53 [c=4 l=4] *addsi3/1 rldicl 9,9,0,32 # 26 [c=4 l=4] zero_extendsidi2/1 rldicl 6,6,0,32 # 48 [c=4 l=4] zero_extendsidi2/1 bne 7,.L4 # 63 [c=4 l=4] *rs6000.md:12311 mr 9,3 # 10 [c=4 l=4] *movdi_internal64/2 .L3: extsw 3,9 # 71 [c=4 l=4] extendsidi2/1 blr # 105 [c=4 l=4] simple_return .p2align 4,,15 .L7: mr 9,6 # 9 [c=4 l=4] *movdi_internal64/2 extsw 3,9 # 113 [c=4 l=4] extendsidi2/1 blr # 116 [c=4 l=4] simple_return .p2align 4,,15 .L5: mr 9,10 # 8 [c=4 l=4] *movdi_internal64/2 extsw 3,9 # 121 [c=4 l=4] extendsidi2/1 blr # 124 [c=4 l=4] simple_return .long 0 .byte 0,0,0,0,0,0,0,0 .cfi_endproc .LFE22: .size bt_skip_func,.-bt_skip_func .section ".toc","aw" .align 3 .LC1: .quad .LC0+3 .section .text.startup,"ax",@progbits .align 2 .p2align 4,,15 .globl main .type main, @function main: .LFB23: .cfi_startproc .LCF1: 0: addis 2,12,.TOC.-.LCF1@ha addi 2,2,.TOC.-.LCF1@l .localentry main,.-main mflr 0 # 23 [c=4 l=4] *movdi_internal64/21 addis 4,2,.LC1@toc@ha # 35 [c=12 l=8] fusion_gpr_load_di ld 4,.LC1@toc@l(4) li 6,1 # 8 [c=4 l=4] *movdi_internal64/3 li 5,3 # 9 [c=4 l=4] *movdi_internal64/3 li 3,55 # 11 [c=4 l=4] *movdi_internal64/3 std 0,16(1) # 24 [c=4 l=4] *movdi_internal64/0 stdu 1,-32(1) # 25 [c=4 l=4] movdi_di_update/1 .cfi_def_cfa_offset 32 .cfi_offset 65, 16 bl bt_skip_func # 12 [c=4 l=4] *call_value_local_aixdi addi 1,1,32 # 28 [c=4 l=4] *adddi3/1 .cfi_def_cfa_offset 0 li 3,0 # 17 [c=4 l=4] *movdi_internal64/3 ld 0,16(1) # 29 [c=8 l=4] *movdi_internal64/1 mtlr 0 # 30 [c=4 l=4] *movdi_internal64/22 .cfi_restore 65 blr # 31 [c=4 l=4] simple_return .long 0 .byte 0,0,0,1,128,0,0,0 .cfi_endproc .LFE23: .size main,.-main .section .rodata.str1.8,"aMS",@progbits,1 .align 3 .LC0: .string "ttttttttttttttttthis is some text that should be longer" .ident "GCC: (GNU) 9.0.0 20180716 (experimental)" .section .note.GNU-stack,"",@progbits
#include <stdlib.h> #include <string.h> int bt_skip_func(const __uint32_t len_limit, const __uint8_t *cur, int delta, __uint32_t len) { const __uint8_t *pb = cur - delta; while (++len != len_limit) { if (pb[len] != cur[len]) break; if (++len == len_limit) break; if (pb[len] != cur[len]) break; if (++len == len_limit) break; if (pb[len] != cur[len]) break; if (++len == len_limit) break; if (pb[len] != cur[len]) break; } return len; } int main (int argc, char *argv[]) { char *text_input = "ttttttttttttttttthis is some text that should be longer"; int len_limit = strlen (text_input); int pos = 0; int cur_match = 0; int depth = 0; int result = bt_skip_func(len_limit, text_input + 3, 3, 1); }