// Source: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=50374
//
// vincenzo Innocente <vincenzo.innocente at cern dot ch> changed:
//
//            What    |Removed                     |Added
// ----------------------------------------------------------------------------
//       Known to fail|                            |4.9.1
//
// --- Comment #27 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
// Coming back to this old issue: any chance to see it implemented in the
// auto-vectorizer soon?
//
// Using "extended vectors" I managed to vectorize "min_element" as below.
// In principle the auto-vectorizer should be able to do the same starting
// from the loop in comment 3.
//
// Review notes on this copy (differences from the code as posted):
//   * minloc: the lane-index vector was subscripted with the array index `i`
//     instead of the lane number `i - M`, writing outside the 4-lane vector
//     whenever M > 0. Both initialisation loops now use `i - M`.
//   * minloc: an empty range (N <= 0) returns 0 instead of reading x[0].
//   * main: std::random_shuffle (removed in C++17) replaced by std::shuffle.

#include <algorithm>
#include <iostream>
#include <random>
#include <x86intrin.h>

// Four packed floats, naturally (16-byte) aligned.
typedef float __attribute__((vector_size(16))) float32x4_t;
// Same layout, but only 4-byte alignment is assumed; used for unaligned loads.
typedef float __attribute__((vector_size(16), aligned(4))) float32x4a4_t;
// Four packed ints, one per float lane.
typedef int __attribute__((vector_size(16))) int32x4_t;

/// Loads four consecutive floats starting at `x`; `x` only needs the
/// alignment of a float.
inline float32x4_t load(float const* x) {
  return *reinterpret_cast<float32x4a4_t const*>(x);
}

/// Returns the index of a minimum element of x[0..N), or 0 when N <= 0.
///
/// Each of the four vector lanes tracks the smallest value seen in that lane
/// together with the array index it came from; the four lane results are
/// reduced with a scalar loop at the end.
///
/// NOTE: when the minimum value occurs more than once, the index returned is
/// not necessarily the first occurrence (std::min_element returns the first).
int minloc(float const* x, int N) {
  if (N <= 0) return 0;  // nothing to scan; avoids reading x[0]

  float32x4_t v0;   // per-lane running minimum
  int32x4_t index;  // per-lane array index of that minimum

  // M is N rounded down to a multiple of 4: the part handled by full vectors.
  auto M = 4 * (N / 4);

  // Seed the lanes with the tail elements x[M..N) ...
  for (int i = M; i < N; ++i) {
    v0[i - M] = x[i];
    index[i - M] = i;  // lane number is i - M, not i
  }
  // ... and pad the remaining lanes with element 0, so every lane starts
  // from a valid (value, index) pair.
  for (int i = N; i < M + 4; ++i) {
    v0[i - M] = x[0];
    index[i - M] = 0;  // lane number is i - M, not i
  }

  // j holds the array indices of the four elements currently in `v`.
  int32x4_t j = {0, 1, 2, 3};
  for (int i = 0; i < M; i += 4) {
    decltype(auto) v = load(x + i);
    // Update the index first: it needs the comparison against the old v0.
    index = (v < v0) ? j : index;
    v0 = (v < v0) ? v : v0;
    j += 4;
  }

  // Horizontal reduction over the four lanes.
  auto k = 0;
  for (int i = 1; i < 4; ++i)
    if (v0[i] < v0[k]) k = i;
  return index[k];
}

// Receives the auxiliary value written by __rdtscp; otherwise unused.
unsigned int taux = 0;

/// Reads the time-stamp counter via the RDTSCP instruction.
inline unsigned long long rdtscp() { return __rdtscp(&taux); }

int main() {
  float x[1024];
  // Distinct values: odd positions positive, even positions negative.
  for (int i = 0; i < 1024; ++i) x[i] = i % 2 ? i : -i;

  std::mt19937 rng;  // default-seeded generator for std::shuffle

  for (int i = 0; i < 10; ++i) {
    std::shuffle(x, x + 1024, rng);

    // Scalar reference; the range starts at x + i, so it shrinks (and its
    // alignment changes) on every iteration.
    long long ts = -rdtscp();
    int l1 = std::min_element(x + i, x + 1024) - (x + i);
    ts += rdtscp();

    // Vectorized version on the same range.
    long long tv = -rdtscp();
    int l2 = minloc(x + i, 1024 - i);
    tv += rdtscp();

    std::cout << "min is at " << l1 << ' ' << ts << '\n';
    std::cout << "minloc " << l2 << ' ' << tv << '\n';
  }
  return 0;
}

// which results in a pretty good speed-up:
//
//   c++ -std=c++1y -Ofast minloc.cc -march=nehalem
//   ./a.out
//
// Output reported in the original comment:
//
//   min is at 959 13780
//   minloc 959 2380
//   min is at 536 13680
//   minloc 536 4972
//   min is at 513 13648
//   minloc 513 1848
//   min is at 825 13640
//   minloc 825 1924
//   min is at 885 13628
//   minloc 885 1644
//   min is at 636 11252
//   minloc 636 1536
//   min is at 982 11240
//   minloc 982 1416
//   min is at 382 11228
//   minloc 382 1392
//   min is at 271 11216
//   minloc 271 1340
//   min is at 50 11204
//   minloc 50 1384