https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66741
--- Comment #1 from Bernhard Reutner-Fischer <aldot at gcc dot gnu.org> ---
i.e. maybe something more along the lines of
$ cat <<EOF | gcc-5 -xc -S - -o - -Ofast -fomit-frame-pointer
-minline-all-stringops -mstringop-strategy=unrolled_loop -fdump-tree-all-all
-fdump-rtl-all-all -fdump-ipa-all-all -msse4
#include <smmintrin.h>
#include <assert.h>
#include <stdint.h>
/*
 * Lower-case ASCII letters of the NUL-terminated string s into d using the
 * SSE4.2 string-compare intrinsics, working on unaligned 16-byte chunks.
 *
 * For every chunk, bytes inside the range 'A'..'Z' get bit 0x20 toggled
 * (which maps them to 'a'..'z'); all other bytes of the chunk are stored
 * unchanged.  The loop ends after the first chunk that contains a NUL byte.
 *
 * NOTE(review): the store is guarded by _mm_cmpistrc, so a chunk holding no
 * upper-case byte is never written to d.  Despite the name this is therefore
 * not a complete strcpy; presumably acceptable for a code-generation test
 * case, but confirm before reusing the routine.
 * NOTE(review): d is declared const char * yet is written through.
 * NOTE(review): every load and store covers a full 16 bytes, so both buffers
 * are touched up to 15 bytes beyond the terminator; callers need that slack.
 */
void
sse_tolower_strcpy (const char *d, const char *s)
{
/* One range pair, 'A'..'Z'; the zero bytes end the implicit-length operand. */
__m128i ranges =
_mm_setr_epi8 ('A', 'Z', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m128i *src = (__m128i *) s;
__m128i *dst = (__m128i *) d;
/* ASCII case bit, replicated into all 16 byte lanes. */
const __m128i diff = _mm_set1_epi8 (0x20);
/* Unsigned bytes, range comparison, result expanded to a per-byte mask. */
const uint8_t mode = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK;
for (;; src++, dst++)
{
const __m128i chunk = _mm_loadu_si128 (src);
/* Carry flag: set when at least one byte of chunk fell inside a range. */
if (_mm_cmpistrc (ranges, chunk, mode))
{
/* Byte mask: 0xff in every lane that matched, 0 elsewhere. */
const __m128i tmp1 = _mm_cmpistrm (ranges, chunk, mode);
/* Reduce the mask to just the case bit for the matching lanes. */
const __m128i mask = _mm_and_si128 (tmp1, diff);
/* Toggle the case bit on the matching lanes and store the chunk. */
_mm_storeu_si128 (dst, _mm_xor_si128 (chunk, mask));
}
/* Zero flag: set when chunk contains the terminating NUL. */
if (_mm_cmpistrz (ranges, chunk, mode))
break;
}
}
#ifdef MAIN
#include <unistd.h>
#include <string.h>
/*
 * Test driver: read up to 127 bytes from stdin, lower-case them with
 * sse_tolower_strcpy and write the result to stderr (fd 2).
 *
 * Returns 0 on success, 1 when nothing could be read or the write failed.
 */
int main(void) {
    /*
     * Both buffers are zero-filled up front: sse_tolower_strcpy loads whole
     * 16-byte chunks from src, and it only stores a chunk to dest when that
     * chunk contains an upper-case byte, so dest must already be a valid
     * (empty) string for the strlen below to be well defined.
     */
    char src[128] = {0}, dest[128] = {0};

    /* Reserve one byte so the terminator always fits inside src. */
    ssize_t n = read(0, src, sizeof(src) - 1);
    if (n < 1)
        return 1; /* read error or end of input: nothing to convert */
    src[n] = 0;

    sse_tolower_strcpy(dest, src);

    if (write(2, dest, strlen(dest)) < 0)
        return 1;
    return 0;
}
#endif
EOF
.file ""
.section .text.unlikely,"ax",@progbits
.LCOLDB2:
.text
.LHOTB2:
.p2align 4,,15
.globl sse_tolower_strcpy
.type sse_tolower_strcpy, @function
sse_tolower_strcpy:
.LFB641:
.cfi_startproc
movdqa .LC0(%rip), %xmm2
movdqa .LC1(%rip), %xmm3
jmp .L4
.p2align 4,,10
.p2align 3
.L2:
pcmpistrm $68, %xmm1, %xmm2
je .L1
.L9:
addq $16, %rsi
addq $16, %rdi
.L4:
movdqu (%rsi), %xmm1
pcmpistrm $68, %xmm1, %xmm2
jnc .L2
pand %xmm3, %xmm0
pxor %xmm1, %xmm0
movups %xmm0, (%rdi)
pcmpistrm $68, %xmm1, %xmm2
jne .L9
.L1:
rep ret
.cfi_endproc
.LFE641:
.size sse_tolower_strcpy, .-sse_tolower_strcpy
.section .text.unlikely
.LCOLDE2:
.text
.LHOTE2:
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LC0:
.byte 65
.byte 90
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.align 16
.LC1:
.quad 2314885530818453536
.quad 2314885530818453536
.ident "GCC: (Debian 5.1.1-12) 5.1.1 20150622"
.section .note.GNU-stack,"",@progbits
This would be *much* smaller and supposedly is also faster:
text data bss dec hex filename
228 0 0 228 e4 comment0.o
153 0 0 153 99 comment1.o