https://gcc.gnu.org/bugzilla/show_bug.cgi?id=50374

vincenzo Innocente <vincenzo.innocente at cern dot ch> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
      Known to fail|                            |4.9.1

--- Comment #27 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
Coming back to this old issue.
Any chance of seeing it implemented in the auto-vectorizer soon?

Using "extended vectors" I managed to vectorize "min_element" as shown below.
In principle, the auto-vectorizer should be able to do the same starting from
the loop in comment 3.


// Compiler "vector extension" types: four 32-bit lanes packed into 16 bytes.
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
// Same lane layout as float32x4_t, but declared with only 4-byte alignment so
// that it can be read through a pointer to any float.
typedef float __attribute__( ( vector_size( 16 ) , aligned(4) ) )
float32x4a4_t;
typedef int __attribute__( ( vector_size( 16 ) ) ) int32x4_t;


// Reads the four consecutive floats x[0..3] into one vector.
// The read goes through the 4-byte-aligned vector type, so x does not have to
// be 16-byte aligned. The caller must guarantee that x[0..3] are readable.
inline
float32x4_t load(float const * x) {
   return *reinterpret_cast<float32x4a4_t const *>(x);
}


// Returns the index of the smallest element of x[0..N-1].
//
// When the minimum value occurs more than once, the lowest index is returned,
// which is the element std::min_element would point at. For N <= 0 nothing is
// read and 0 is returned (the same offset std::min_element yields for an
// empty range).
//
// Layout of the work:
//   1. every lane starts from x[0] / index 0;
//   2. the first 4*(N/4) elements are scanned four at a time, each lane
//      tracking the smallest value it has seen and where it saw it;
//   3. the four lanes are reduced to one candidate;
//   4. the remaining N%4 elements are checked one by one.
int minloc(float const * x, int N) {
  if (N <= 0) return 0;

  // Number of elements covered by whole 4-wide vectors.
  const int M = 4*(N/4);

  float32x4_t v0 = {x[0], x[0], x[0], x[0]};   // per-lane running minimum
  int32x4_t index = {0, 0, 0, 0};              // per-lane position of that minimum
  int32x4_t j = {0, 1, 2, 3};                  // positions of the lanes being loaded

  for (int i=0; i<M; i+=4) {
    const float32x4_t v = load(x+i);
    // Strict '<' keeps the earliest position within each lane.
    const auto smaller = (v < v0);
    index = smaller ? j : index;
    v0 = smaller ? v : v0;
    j += 4;
  }

  // Horizontal reduction. Lanes interleave positions, so equal values must be
  // ordered by their recorded position to find the earliest one overall.
  int k = 0;
  for (int lane=1; lane<4; ++lane) {
    const bool lower = v0[lane] < v0[k];
    const bool sameButEarlier = (v0[lane] == v0[k]) && (index[lane] < index[k]);
    if (lower || sameButEarlier) k = lane;
  }
  float best = v0[k];
  int where = index[k];

  // Scalar tail: these positions all come after the vector part, so only a
  // strictly smaller value may replace the current candidate.
  for (int i=M; i<N; ++i) {
    if (x[i] < best) {
      best = x[i];
      where = i;
    }
  }
  return where;
}


#include <algorithm>
#include <iostream>
#include <random>
#include <x86intrin.h>
// Scratch slot handed to __rdtscp(), which stores its auxiliary output here.
// Nothing in this listing reads the value back.
unsigned int taux = 0;

// Returns the counter value produced by the __rdtscp() intrinsic.
inline unsigned long long rdtscp() {
  const unsigned long long ticks = __rdtscp(&taux);
  return ticks;
}

// Benchmark driver: compares std::min_element with minloc() on shuffled data.
//
// The array is filled with 1024 distinct values (odd i -> i, even i -> -i),
// then reshuffled before each of the 10 trials. Trial i searches the
// sub-range starting at x+i, so the start address is not always 16-byte
// aligned and the length is not always a multiple of 4. For each trial the
// found position and the elapsed rdtscp() ticks are printed for both
// implementations.
int main() {

  constexpr int kSize = 1024;
  constexpr int kTrials = 10;

  float x[kSize];
  for (int i=0; i<kSize; ++i) x[i]= i%2 ? i : -i;

  // std::random_shuffle is deprecated in C++14 and removed in C++17; use
  // std::shuffle instead. The default-seeded engine keeps runs reproducible.
  std::mt19937 gen;

  for (int i = 0; i<kTrials; ++i) {
    std::shuffle(x, x+kSize, gen);

    // Elapsed ticks are taken as a plain unsigned difference.
    const unsigned long long s0 = rdtscp();
    const int l1 = static_cast<int>(std::min_element(x+i, x+kSize) - (x+i));
    const unsigned long long ts = rdtscp() - s0;

    const unsigned long long v0 = rdtscp();
    const int l2 = minloc(x+i, kSize-i);
    const unsigned long long tv = rdtscp() - v0;

    std::cout << "min is at " << l1 << ' ' << ts << '\n';
    std::cout << "minloc " << l2 << ' ' << tv << '\n';
  }
  return 0;

}


which results in a pretty good speed-up:
c++ -std=c++1y -Ofast minloc.cc -march=nehalem
./a.out
./a.out 
min is at 959 13780
minloc 959 2380
min is at 536 13680
minloc 536 4972
min is at 513 13648
minloc 513 1848
min is at 825 13640
minloc 825 1924
min is at 885 13628
minloc 885 1644
min is at 636 11252
minloc 636 1536
min is at 982 11240
minloc 982 1416
min is at 382 11228
minloc 382 1392
min is at 271 11216
minloc 271 1340
min is at 50 11204
minloc 50 1384

Reply via email to