Ian Lance Taylor <i...@google.com> wrote:
> Tests that directly invoke __builtin functions are not appropriate for
> your replacement for emmintrin.h.
Clearly. However, I do not see why these are in the test routines in the first place. They seem not to be needed. I made the changes below my signature, eliminating all of the vector builtins, and the programs still worked with both -msse2 and -mno-sse2 plus my software SSE2. If anything, the test programs are much easier to understand without the builtins.

There is also a (big) problem with sse2-vec-2.c (and -2a, which is empty other than an #include sse2-vec-2.c). There are no explicit sse2 operations within this test program. Moreover, the code within the tests does not work. Finally, if one puts a print statement anywhere in the test that is there and compiles it with:

gcc -msse -msse2

there will be no warnings, and the run will appear to show a valid test, but in actuality the test will never execute! This shows part of the problem:

gcc -Wall -msse -msse2 -o foo sse2-vec-2.c
sse-os-support.h:27: warning: 'sse_os_support' defined but not used
sse2-check.h:10: warning: 'do_test' defined but not used

(also for -m64)

There must be some sort of main in there, but no test; it does nothing and returns a valid status. When stuffed with debug statements:

for (i = 0; i < 2; i++)
  masks[i] = i;
printf("DEBUG res[0] %llX\n",res[0]);
printf("DEBUG res[1] %llX\n",res[1]);
printf("DEBUG val1.ll[0] %llX\n",val1.ll[0]);
printf("DEBUG val1.ll[1] %llX\n",val1.ll[1]);
for (i = 0; i < 2; i++)
  if (res[i] != val1.ll [masks[i]]){
    printf("DEBUG i %d\n",i);
    printf("DEBUG masks[i] %d\n",masks[i]);
    printf("DEBUG val1.ll [masks[i]] %llX\n", val1.ll [masks[i]]);
    abort ();
  }

and compiled with my software SSE2:

gcc -Wall -msse -mno-sse2 -I. -O0 -m32 -lm -DSOFT_SSE2 -DEMMSOFTDBG -o foo sse2-vec-2.c

it emits:

DEBUG res[0] 3020100
DEBUG res[1] 7060504
DEBUG val1.ll[0] 706050403020100
DEBUG val1.ll[1] F0E0D0C0B0A0908
DEBUG i 0
DEBUG masks[i] 0
DEBUG val1.ll [masks[i]] 706050403020100
Aborted

True enough, 3020100 != 706050403020100, but what kind of test is that???
Regards, David Mathog mat...@caltech.edu Manager, Sequence Analysis Facility, Biology Division, Caltech ******************************************************************** changes to sse2-vec-*.c routines to eliminate all of the __builtin calls: ls -1 sse2-vec*dist | grep -v vec-2 | extract -cols 'diff --context=0 [1,-6] [1,]' | execinput *** sse2-vec-1.c 2010-11-24 09:06:46.000000000 -0800 --- sse2-vec-1.c.dist 2010-11-24 09:06:39.000000000 -0800 *************** *** 27,28 **** ! res[0] = val1.d[msk0]; ! res[1] = val1.d[msk1]; --- 27,28 ---- ! res[0] = __builtin_ia32_vec_ext_v2df ((__v2df)val1.x, msk0); ! res[1] = __builtin_ia32_vec_ext_v2df ((__v2df)val1.x, msk1); *** sse2-vec-3.c 2010-11-24 09:09:13.000000000 -0800 --- sse2-vec-3.c.dist 2010-11-24 09:07:48.000000000 -0800 *************** *** 27,30 **** ! res[0] = val1.i[0]; ! res[1] = val1.i[1]; ! res[2] = val1.i[2]; ! res[3] = val1.i[3]; --- 27,30 ---- ! res[0] = __builtin_ia32_vec_ext_v4si ((__v4si)val1.x, 0); ! res[1] = __builtin_ia32_vec_ext_v4si ((__v4si)val1.x, 1); ! res[2] = __builtin_ia32_vec_ext_v4si ((__v4si)val1.x, 2); ! res[3] = __builtin_ia32_vec_ext_v4si ((__v4si)val1.x, 3); *** sse2-vec-4.c 2010-11-24 09:10:00.000000000 -0800 --- sse2-vec-4.c.dist 2010-11-24 09:07:48.000000000 -0800 *************** *** 27,34 **** ! res[0] = val1.s[0]; ! res[1] = val1.s[1]; ! res[2] = val1.s[2]; ! res[3] = val1.s[3]; ! res[4] = val1.s[4]; ! res[5] = val1.s[5]; ! res[6] = val1.s[6]; ! res[7] = val1.s[7]; --- 27,34 ---- ! res[0] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 0); ! res[1] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 1); ! res[2] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 2); ! res[3] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 3); ! res[4] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 4); ! res[5] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 5); ! res[6] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 6); ! 
res[7] = __builtin_ia32_vec_ext_v8hi ((__v8hi)val1.x, 7); *** sse2-vec-5.c 2010-11-24 09:11:09.000000000 -0800 --- sse2-vec-5.c.dist 2010-11-24 09:07:48.000000000 -0800 *************** *** 27,42 **** ! res[0] = val1.c[0]; ! res[1] = val1.c[1]; ! res[2] = val1.c[2]; ! res[3] = val1.c[3]; ! res[4] = val1.c[4]; ! res[5] = val1.c[5]; ! res[6] = val1.c[6]; ! res[7] = val1.c[7]; ! res[8] = val1.c[8]; ! res[9] = val1.c[9]; ! res[10] = val1.c[10]; ! res[11] = val1.c[11]; ! res[12] = val1.c[12]; ! res[13] = val1.c[13]; ! res[14] = val1.c[14]; ! res[15] = val1.c[15]; --- 27,42 ---- ! res[0] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 0); ! res[1] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 1); ! res[2] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 2); ! res[3] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 3); ! res[4] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 4); ! res[5] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 5); ! res[6] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 6); ! res[7] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 7); ! res[8] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 8); ! res[9] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 9); ! res[10] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 10); ! res[11] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 11); ! res[12] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 12); ! res[13] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 13); ! res[14] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 14); ! res[15] = __builtin_ia32_vec_ext_v16qi ((__v16qi)val1.x, 15); *** sse2-vec-6.c 2010-11-24 09:36:57.000000000 -0800 --- sse2-vec-6.c.dist 2010-11-24 09:07:48.000000000 -0800 *************** *** 28,36 **** ! ! res[0].x = val1.x; res[0].s[0] = ins[0]; ! res[1].x = val1.x; res[1].s[1] = ins[0]; ! res[2].x = val1.x; res[2].s[2] = ins[0]; ! res[3].x = val1.x; res[3].s[3] = ins[0]; ! res[4].x = val1.x; res[4].s[4] = ins[0]; ! res[5].x = val1.x; res[5].s[5] = ins[0]; ! 
res[6].x = val1.x; res[6].s[6] = ins[0]; ! res[7].x = val1.x; res[7].s[7] = ins[0]; --- 28,43 ---- ! res[0].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 0); ! res[1].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 1); ! res[2].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 2); ! res[3].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 3); ! res[4].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 4); ! res[5].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 5); ! res[6].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 6); ! res[7].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[0], 7); *************** *** 47 **** --- 55 ---- + *************** *** 50 **** ! res[i].x = val1.x; res[i].s[0] = ins[i]; --- 58,59 ---- ! res[i].x = (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)val1.x, ! ins[i], 0);