https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69489

--- Comment #14 from Julian Taylor <jtaylor.debian at googlemail dot com> ---
I am on x86_64. It actually does vectorize with -mavx but not with -msse2.
The other variant of the loop I posted does vectorize with sse2.


$ gcc --version
gcc (GCC) 7.0.0 20160421 (experimental)
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


$ cat test.c

/*
 * Scan the first n elements of u and v and count the positions where the
 * two arrays disagree in truthiness:
 *   ntf - u[i] is nonzero and v[i] is zero
 *   nft - u[i] is zero and v[i] is nonzero
 *
 * Returns 2.0 * ntf * nft, evaluated in double precision.  When n <= 0 the
 * loop body never executes and the result is 0.0.
 */
double
yule_bool_distance_char2(const char *u, const char *v, long n)
{
    long i;
    /* ntt and nff are initialized here but never read or updated below. */
    long ntt = 0l, nff = 0l, nft = 0l, ntf = 0l;

    for (i = 0l; i < n; i++) {
        /* Each logical expression yields 0 or 1, which is added to the count. */
        ntf += (u[i] && !v[i]);
        nft += (!u[i] && v[i]);
    }   
    /* The 2.0 factor makes the multiplication happen in double, not long. */
    return (2.0 * ntf * nft);
}


$ gcc -O2 -ftree-vectorize test.c -c
#same with O3
$ objdump -d test.o

test.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <yule_bool_distance_char2>:
   0:   48 85 d2                test   %rdx,%rdx
   3:   7e 69                   jle    6e <yule_bool_distance_char2+0x6e>
   5:   55                      push   %rbp
   6:   53                      push   %rbx
   7:   45 31 d2                xor    %r10d,%r10d
   a:   45 31 db                xor    %r11d,%r11d
   d:   31 c0                   xor    %eax,%eax
   f:   31 ed                   xor    %ebp,%ebp
  11:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  18:   44 0f b6 0c 06          movzbl (%rsi,%rax,1),%r9d
  1d:   44 0f b6 04 07          movzbl (%rdi,%rax,1),%r8d
  22:   45 84 c9                test   %r9b,%r9b
  25:   0f 94 c3                sete   %bl
  28:   31 c9                   xor    %ecx,%ecx
  2a:   45 84 c0                test   %r8b,%r8b
  2d:   0f 95 c1                setne  %cl
  30:   48 21 d9                and    %rbx,%rcx
  33:   49 01 ca                add    %rcx,%r10
  36:   31 c9                   xor    %ecx,%ecx
  38:   45 84 c9                test   %r9b,%r9b
  3b:   0f 95 c1                setne  %cl
  3e:   45 84 c0                test   %r8b,%r8b
  41:   48 0f 45 cd             cmovne %rbp,%rcx
  45:   48 83 c0 01             add    $0x1,%rax
  49:   49 01 cb                add    %rcx,%r11
  4c:   48 39 c2                cmp    %rax,%rdx
  4f:   75 c7                   jne    18 <yule_bool_distance_char2+0x18>
  51:   66 0f ef c0             pxor   %xmm0,%xmm0
  55:   66 0f ef c9             pxor   %xmm1,%xmm1
  59:   5b                      pop    %rbx
  5a:   f2 49 0f 2a c2          cvtsi2sd %r10,%xmm0
  5f:   f2 49 0f 2a cb          cvtsi2sd %r11,%xmm1
  64:   5d                      pop    %rbp
  65:   f2 0f 58 c0             addsd  %xmm0,%xmm0
  69:   f2 0f 59 c1             mulsd  %xmm1,%xmm0
  6d:   c3                      retq   
  6e:   66 0f ef c0             pxor   %xmm0,%xmm0
  72:   c3                      retq

Reply via email to