On 10.02.2019 10:33, Yuriy M. Kaminskiy wrote: > On 09.02.2019 20:48, Yuriy M. Kaminskiy wrote: >> 2) it can expose bugs on some cpus, that are not caught by testsuite >> (e.g. [previous version of] my _mm_extract_epi64 replacement was very >> buggy - but it was not detected by debian's qemu test [as it only runs >> only single unit test for gf64, but only gf128 was affected]); > > Just in case, I (ab)used valgrind hook to run complete testsuite for all > combination of flags for i386 and amd64, incremental debdiff attached (on the > top of above). > > Known problems: takes a bit too long to run (2.5 hours for amd64, and even > more for i386).
Unfortunately, after playing with it a bit, I found several problems: 1) runtime cpu detection on i386 was not enabled in the sources (so, [on i386] all simd code was compiled, but not used); a new patch was added; 2) qemu-user in stretch does not parse -cpu in the way this test expects it to (so my tests were flawed); I added a B-D on the version in buster; 3) with (1) & (2) fixed, I found that i386 sse4/pclmul was broken - due to my mistake (the _mm_extract_epi64 emulation was broken; caught by the unit test); patch 0001* was updated; 4) with (3) fixed, I discovered that my qemu-for-complete-testsuite patch was flawed - gf_method should run on the same "cpu" as gf_unit (some tests are activated only on a supporting cpu); see patch 0007* (not for upstream, too "hacky") and the debian/rules update. I rebuilt and retested the package on i386 and amd64, and everything seems fine. A new cumulative debdiff is attached.
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog 2018-05-22 16:43:40.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog 2019-02-09 13:29:51.000000000 +0300 @@ -1,3 +1,13 @@ +gf-complete (1.0.2+2017.04.10.git.ea75cdf-3~bpo9+1~local2) stretch-backports; urgency=medium + + * Rebuild for stretch-backports. + * Fix i386 simd compilation. + * Fix runtime cpudetection. + * Fix neon for armhf. + * Run complete test suite under qemu. + + -- Yuriy M. Kaminskiy <yumkam+deb...@gmail.com> Sat, 09 Feb 2019 13:29:51 +0300 + gf-complete (1.0.2+2017.04.10.git.ea75cdf-3) unstable; urgency=medium * remove patch: 0001-temporarily-disable-sse3-and-above.patch (Closes: #899296) diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control 2018-05-22 16:43:40.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control 2019-02-09 13:29:51.000000000 +0300 @@ -7,7 +7,7 @@ Shengjing Zhu <i...@zhsj.me>, Build-Depends: debhelper (>= 10), - qemu-user-static [amd64] <!nocheck>, + qemu-user-static (>= 1:3.1~) [i386 amd64 armhf] <!nocheck>, Standards-Version: 4.1.4 Homepage: http://jerasure.org/ Vcs-Git: https://salsa.debian.org/openstack-team/third-party/gf-complete.git diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch 1970-01-01 03:00:00.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch 2019-02-09 13:29:51.000000000 +0300 @@ -0,0 +1,27 @@ +From d42fe7d12cdc5f14e1bd9fd13f5d56e2689793dd Mon Sep 17 00:00:00 
2001 +From: "Yuriy M. Kaminskiy" <yum...@gmail.com> +Date: Sat, 9 Feb 2019 12:55:11 +0300 +Subject: [PATCH 1/6] Fix compilation on i386 + +--- + include/gf_complete.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/gf_complete.h b/include/gf_complete.h +index c4783e8..9436eb8 100644 +--- a/include/gf_complete.h ++++ b/include/gf_complete.h +@@ -19,6 +19,10 @@ + #ifdef __SSE4_1__ + #include <smmintrin.h> + #endif ++ #ifdef __i386 ++ #define _mm_insert_epi64(A,B,C) _mm_insert_epi32(_mm_insert_epi32((A),(uint32_t)(B),(C)*2),(uint32_t)((uint64_t)(B)>>32),(C)*2+1) ++ #define _mm_extract_epi64(A,C) ((((uint64_t)_mm_extract_epi32((A),(C)*2+1))<<32)|(uint32_t)_mm_extract_epi32((A),(C)*2)) ++ #endif + #endif + + #ifdef INTEL_SSSE3 +-- +2.11.0 + diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch 1970-01-01 03:00:00.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch 2019-02-09 13:29:51.000000000 +0300 @@ -0,0 +1,967 @@ +From b811ddd5cc50b7f3539d80419549f0712b5438a7 Mon Sep 17 00:00:00 2001 +From: "Yuriy M. 
Kaminskiy" <yum...@gmail.com> +Date: Sat, 9 Feb 2019 13:28:43 +0300 +Subject: [PATCH 2/6] Fix runtime cpudetection + +--- + include/gf_complete.h | 3 +-- + m4/ax_ext.m4 | 19 ++++++++-------- + src/Makefile.am | 5 ++++- + src/gf.c | 60 +++++++++++++++++++++++++++++++++++++++------------ + src/gf_w128.c | 21 ++++++++++++++++++ + src/gf_w16.c | 36 +++++++++++++++++++++++++++++++ + src/gf_w32.c | 45 ++++++++++++++++++++++++++++++++++++++ + src/gf_w4.c | 51 +++++++++++++++++++++++++++++++++++++++++++ + src/gf_w64.c | 33 ++++++++++++++++++++++++++++ + src/gf_w8.c | 33 ++++++++++++++++++++++++++++ + 10 files changed, 280 insertions(+), 26 deletions(-) + +diff --git a/include/gf_complete.h b/include/gf_complete.h +index 9436eb8..0bdf9fc 100644 +--- a/include/gf_complete.h ++++ b/include/gf_complete.h +@@ -15,8 +15,7 @@ + #ifdef INTEL_SSE4 + #ifdef __SSE4_2__ + #include <nmmintrin.h> +- #endif +- #ifdef __SSE4_1__ ++ #else /* def __SSE4_1__ */ + #include <smmintrin.h> + #endif + #ifdef __i386 +diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4 +index 95c4dbe..e18b678 100644 +--- a/m4/ax_ext.m4 ++++ b/m4/ax_ext.m4 +@@ -14,14 +14,14 @@ AC_DEFUN([AX_EXT], + + AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes]) + if test "$ax_cv_have_neon_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON"], [ax_cv_have_neon_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS -DARM_NEON" NEON_FLAGS="-march=armv8-a+simd"], [ax_cv_have_neon_ext=no]) + fi + ;; + + arm*) + AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes]) + if test "$ax_cv_have_neon_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON"], [ax_cv_have_neon_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -DARM_NEON" NEON_FLAGS="-mfpu=neon"], [ax_cv_have_neon_ext=no]) + fi + ;; + +@@ -36,40 +36,41 @@ 
AC_DEFUN([AX_EXT], + + AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], [ax_cv_have_sse_ext=yes]) + if test "$ax_cv_have_sse_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"], [ax_cv_have_sse_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE"], [ax_cv_have_sse_ext=no]) + fi + + AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], [ax_cv_have_sse2_ext=yes]) + if test "$ax_cv_have_sse2_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no]) + fi + + AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], [ax_cv_have_sse3_ext=yes]) + if test "$ax_cv_have_sse3_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no]) + fi + + AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], [ax_cv_have_ssse3_ext=yes]) + if test "$ax_cv_have_ssse3_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no]) + fi + + AC_CACHE_CHECK([whether pclmuldq is enabled], [ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes]) + if test "$ax_cv_have_pclmuldq_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no]) + fi + + AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], [ax_cv_have_sse41_ext=yes]) + if test "$ax_cv_have_sse41_ext" = yes; then +- 
AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no]) + fi + + AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], [ax_cv_have_sse42_ext=yes]) + if test "$ax_cv_have_sse42_ext" = yes; then +- AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no]) ++ AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no]) + fi + ;; + esac + + AC_SUBST(SIMD_FLAGS) ++ AC_SUBST(NEON_FLAGS) + ]) +diff --git a/src/Makefile.am b/src/Makefile.am +index cfc2a50..4949497 100644 +--- a/src/Makefile.am ++++ b/src/Makefile.am +@@ -21,7 +21,10 @@ libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare + libgf_complete_la_LIBADD = libgf_util.la + + if HAVE_NEON +-libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ ++noinst_LTLIBRARIES += libgf_neon.la ++libgf_complete_la_LIBADD += libgf_neon.la ++libgf_neon_la_CFLAGS = $(libgf_complete_la_CFLAGS) $(NEON_FLAGS) ++libgf_neon_la_SOURCES = neon/gf_w4_neon.c \ + neon/gf_w8_neon.c \ + neon/gf_w16_neon.c \ + neon/gf_w32_neon.c \ +diff --git a/src/gf.c b/src/gf.c +index 84d6996..3db6acc 100644 +--- a/src/gf.c ++++ b/src/gf.c +@@ -910,23 +910,16 @@ void gf_multby_zero(void *dest, int bytes, int xor) + + static void gf_unaligned_xor(void *src, void *dest, int bytes); + +-void gf_multby_one(void *src, void *dest, int bytes, int xor) ++#ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif ++static void gf_multby_one_sse2(void *src, void *dest, int bytes, int xor) + { + unsigned long uls, uld; + uint8_t *s8, *d8; + uint64_t *s64, *d64, *dtop64; + gf_region_data rd; +- +- if (!xor) { +- if (dest != src) +- memcpy(dest, src, bytes); +- return; +- } +- uls = (unsigned long) src; +- uld = (unsigned long) dest; +- +-#ifdef INTEL_SSE2 +- if 
(gf_cpu_supports_intel_sse2) { + __m128i ms, md; + int abytes; + s8 = (uint8_t *) src; +@@ -970,10 +963,23 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) + s8++; + } + return; +- } ++} + #endif ++ + #if defined(ARM_NEON) +- if (gf_cpu_supports_arm_neon) { ++#ifndef __ARM_NEON ++#ifdef ARCH_AARCH64 ++__attribute__((target("+simd"))) ++#else ++__attribute__((target("fpu=neon"))) ++#endif ++#endif ++static void gf_multby_one_neon(void *src, void *dest, int bytes, int xor) ++{ ++ unsigned long uls, uld; ++ uint8_t *s8, *d8; ++ uint64_t *s64, *d64, *dtop64; ++ gf_region_data rd; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + +@@ -1008,6 +1014,32 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) + d8++; + } + return; ++} ++#endif ++ ++void gf_multby_one(void *src, void *dest, int bytes, int xor) ++{ ++ unsigned long uls, uld; ++ uint8_t *s8, *d8; ++ uint64_t *s64, *d64, *dtop64; ++ gf_region_data rd; ++ ++ if (!xor) { ++ if (dest != src) ++ memcpy(dest, src, bytes); ++ return; ++ } ++ uls = (unsigned long) src; ++ uld = (unsigned long) dest; ++ ++#ifdef INTEL_SSE2 ++ if (gf_cpu_supports_intel_sse2) { ++ return gf_multby_one_sse2(src, dest, bytes, xor); ++ } ++#endif ++#if defined(ARM_NEON) ++ if (gf_cpu_supports_arm_neon) { ++ return gf_multby_one_neon(src, dest, bytes, xor); + } + #endif + if (uls % 8 != uld % 8) { +diff --git a/src/gf_w128.c b/src/gf_w128.c +index 3bc2d65..ad87f21 100644 +--- a/src/gf_w128.c ++++ b/src/gf_w128.c +@@ -83,6 +83,9 @@ int xor) + } + + #if defined(INTEL_SSE4_PCLMUL) ++#if !defined(__PCLMUL__) ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +@@ -293,6 +296,9 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + void + gf_w128_clm_multiply(gf_t 
*gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) + { +@@ -377,6 +383,9 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_ + } + + #if defined(INTEL_SSE4) ++#ifndef __SSE4_1__ ++__attribute__((target("sse4.1"))) ++#endif + void + gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) + { +@@ -434,6 +443,9 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ + + /* Ben: This slow function implements sse instrutions for bytwo_b because why not */ + #if defined(INTEL_SSE4) ++#ifndef __SSE4_1__ ++__attribute__((target("sse4.1"))) ++#endif + void + gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) + { +@@ -595,6 +607,9 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ + } + + #if defined(INTEL_SSSE3) && defined(INTEL_SSE4) ++#if !defined(__SSSE3__) || !defined(__SSE4_1__) ++__attribute__((target("ssse3,sse4.1"))) ++#endif + static + void + gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +@@ -693,6 +708,9 @@ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ + #endif + + #if defined(INTEL_SSSE3) && defined(INTEL_SSE4) ++#if !defined(__SSSE3__) || !defined(__SSE4_1__) ++__attribute__((target("ssse3,sse4.1"))) ++#endif + static + void + gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +@@ -1491,6 +1509,9 @@ void gf_w128_group_r_init(gf_t *gf) + } + + #if 0 // defined(INTEL_SSE4) ++#ifndef __SSE4_1__ ++__attribute__((target("sse4.1"))) ++#endif + static + void gf_w128_group_r_sse_init(gf_t *gf) + { +diff --git a/src/gf_w16.c b/src/gf_w16.c +index 8316892..c683c4e 100644 +--- a/src/gf_w16.c ++++ b/src/gf_w16.c +@@ -80,6 +80,9 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t + } + + #if 
defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -143,6 +146,9 @@ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -211,6 +217,9 @@ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -393,6 +402,9 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) + */ + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -438,6 +450,9 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -476,6 +491,9 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -945,6 +963,9 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v + } + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1078,6 +1099,9 @@ 
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v + #endif + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1503,6 +1527,9 @@ gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ + v = _mm_srli_epi64(v, 1); } + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1566,6 +1593,9 @@ gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) +@@ -1591,6 +1621,9 @@ gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data * + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) +@@ -1619,6 +1652,9 @@ gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *bt + + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +diff --git a/src/gf_w32.c b/src/gf_w32.c +index 976b68b..a20e806 100644 +--- a/src/gf_w32.c ++++ b/src/gf_w32.c +@@ -71,6 +71,9 @@ xor) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +@@ -121,6 +124,9 @@ 
gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32 + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +@@ -175,6 +181,9 @@ gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32 + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +@@ -350,6 +359,9 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -385,6 +397,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +@@ -438,6 +453,9 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32 + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -483,6 +501,9 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -522,6 +543,9 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -1057,6 +1081,9 @@ gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_ + v = _mm_srli_epi64(v, 1); } + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1252,6 +1279,9 @@ gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ + } + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) +@@ -1277,6 +1307,9 @@ gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data * + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) +@@ -1305,6 +1338,9 @@ gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *bt + + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1627,6 +1663,9 @@ gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t + } + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +@@ -1763,6 +1802,9 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t + } + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +@@ -1954,6 +1996,9 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des + + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif 
+ static + void + gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +diff --git a/src/gf_w4.c b/src/gf_w4.c +index 3a7b953..fedba30 100644 +--- a/src/gf_w4.c ++++ b/src/gf_w4.c +@@ -136,6 +136,9 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) + /* Ben: This function works, but it is 33% slower than the normal shift mult */ + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -374,6 +377,9 @@ gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t + #define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); } + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -877,6 +883,9 @@ gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t + v = _mm_srli_epi64(v, 1); } + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -928,6 +937,9 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v + + /* + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -993,6 +1005,9 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v + */ + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1017,6 +1032,9 @@ 
gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1043,6 +1061,9 @@ gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1068,6 +1089,9 @@ gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1096,6 +1120,9 @@ gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) + + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1122,6 +1149,9 @@ gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1148,6 +1178,9 @@ gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1175,6 +1208,9 @@ gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct 
gf_bytwo_data *btd) +@@ -1202,6 +1238,9 @@ gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1230,6 +1269,9 @@ gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1258,6 +1300,9 @@ gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1285,6 +1330,9 @@ gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +@@ -1312,6 +1360,9 @@ gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +diff --git a/src/gf_w64.c b/src/gf_w64.c +index 69e55db..ff78ad1 100644 +--- a/src/gf_w64.c ++++ b/src/gf_w64.c +@@ -58,6 +58,9 @@ xor) + } + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +@@ -145,6 +148,9 @@ xor) + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ 
++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +@@ -341,6 +347,9 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_64_t +@@ -383,6 +392,9 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) + + #if defined(INTEL_SSE4_PCLMUL) + ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_64_t +@@ -425,6 +437,9 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) + + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + void + gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) + { +@@ -1265,6 +1280,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_ + + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) + { + int i; +@@ -1329,6 +1347,9 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) +@@ -1362,6 +1383,9 @@ gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) +@@ -1393,6 +1417,9 @@ gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, 
void *src, void *dest, gf_val_64_t val, int bytes, int xor) +@@ -1671,6 +1698,9 @@ int gf_w64_composite_init(gf_t *gf) + } + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +@@ -1755,6 +1785,9 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des + #endif + + #ifdef INTEL_SSE4 ++#ifndef __SSE4_1__ ++__attribute__((target("sse4.1"))) ++#endif + static + void + gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +diff --git a/src/gf_w8.c b/src/gf_w8.c +index f647a31..d226925 100644 +--- a/src/gf_w8.c ++++ b/src/gf_w8.c +@@ -129,6 +129,9 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) + + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -174,6 +177,9 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -212,6 +218,9 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + inline + gf_val_32_t +@@ -286,6 +295,9 @@ gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t v + } + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int +@@ -344,6 +356,9 @@ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_ + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ 
++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int +@@ -406,6 +421,9 @@ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_ + #endif + + #if defined(INTEL_SSE4_PCLMUL) ++#ifndef __PCLMUL__ ++__attribute__((target("pclmul,sse4.1"))) ++#endif + static + void + gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int +@@ -981,6 +999,9 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in + } + + #ifdef INTEL_SSSE3 ++#ifndef __SSSE3__ ++__attribute__((target("ssse3"))) ++#endif + static + void + gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1606,6 +1627,9 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t + v = _mm_srli_epi64(v, 1); } + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +@@ -1661,6 +1685,9 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) +@@ -1686,6 +1713,9 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt + #endif + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) +@@ -1714,6 +1744,9 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) + + + #ifdef INTEL_SSE2 ++#ifndef __SSE2__ ++__attribute__((target("sse2"))) ++#endif + static + void + 
gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +-- +2.11.0 + diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch 1970-01-01 03:00:00.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch 2019-02-09 13:29:51.000000000 +0300 @@ -0,0 +1,25 @@ +From 03ea8244d34a2800747bac0f0a4355a017f31a7b Mon Sep 17 00:00:00 2001 +From: "Yuriy M. Kaminskiy" <yum...@gmail.com> +Date: Sun, 10 Feb 2019 11:54:23 +0300 +Subject: [PATCH 3/6] Enable runtime cpudetection on i386 + +--- + src/gf_cpu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/gf_cpu.c b/src/gf_cpu.c +index f65131f..e113051 100644 +--- a/src/gf_cpu.c ++++ b/src/gf_cpu.c +@@ -20,7 +20,7 @@ int gf_cpu_supports_intel_sse3 = 0; + int gf_cpu_supports_intel_sse2 = 0; + int gf_cpu_supports_arm_neon = 0; + +-#if defined(__x86_64__) ++#if defined(__x86_64__) || defined(__i386) + + /* CPUID Feature Bits */ + +-- +2.11.0 + diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch 1970-01-01 03:00:00.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch 2019-02-09 13:29:51.000000000 +0300 @@ -0,0 +1,42 @@ +From 44b18e4748b1d5c422d4517648c9bb1267c07bfc Mon Sep 17 00:00:00 2001 +From: "Yuriy M. 
Kaminskiy" <yum...@gmail.com> +Date: Sat, 9 Feb 2019 18:56:09 +0300 +Subject: [PATCH 4/6] Merge back unneeded libgf_util.la + +As SIMD_FLAGS contains only preprocessor macros, splitting off +gf_method.c compilation is no longer needed for runtime cpu detection. +--- + src/Makefile.am | 14 +++----------- + 1 file changed, 3 insertions(+), 11 deletions(-) + +diff --git a/src/Makefile.am b/src/Makefile.am +index 4949497..5e7dafc 100644 +--- a/src/Makefile.am ++++ b/src/Makefile.am +@@ -5,20 +5,12 @@ AUTOMAKE_OPTIONS = subdir-objects + + AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include + +-# avoid using SIMD_FLAGS for code that calls strcmp as new gcc +-# versions will use SIMD for the strcmp implementation. Instead +-# we create a static library just for gf_method that is not compiled +-# with SIMD_FLAGS, this static library will get linked into gf_complete.so +-noinst_LTLIBRARIES = libgf_util.la +-libgf_util_la_SOURCES = gf_method.c +-libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare +- +-# we narrowly use SIMD_FLAGS for code that needs it ++noinst_LTLIBRARIES = + lib_LTLIBRARIES = libgf_complete.la + libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ +- gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c ++ gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c gf_method.c + libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare +-libgf_complete_la_LIBADD = libgf_util.la ++libgf_complete_la_LIBADD = + + if HAVE_NEON + noinst_LTLIBRARIES += libgf_neon.la +-- +2.11.0 + diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch 1970-01-01 03:00:00.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch 2019-02-09 
13:29:51.000000000 +0300 @@ -0,0 +1,26 @@ +From d33f9e3a0f47310145c6b8600a2df66bbd045b01 Mon Sep 17 00:00:00 2001 +From: "Yuriy M. Kaminskiy" <yum...@gmail.com> +Date: Sun, 10 Feb 2019 20:06:20 +0300 +Subject: [PATCH 7/7] hack to run testsuite in qemu + +Not for upstream. +--- + tools/Makefile.am | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/Makefile.am b/tools/Makefile.am +index 4ca9131..ed2717d 100644 +--- a/tools/Makefile.am ++++ b/tools/Makefile.am +@@ -40,7 +40,7 @@ endif + + # gf_unit tests as generated by gf_methods + gf_unit_w%.sh: gf_methods +- ./$^ $(@:gf_unit_w%.sh=%) -A -U ${VALGRIND} > $@ || rm $@ ++ ${QEMU_CMD}./$^ $(@:gf_unit_w%.sh=%) -A -U ${VALGRIND} | perl -p -e 's|^\.\.|${QEMU_CMD}$$&|' > $@ || rm $@ + + TESTS = gf_unit_w128.sh \ + gf_unit_w64.sh \ +-- +2.11.0 + diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series 1970-01-01 03:00:00.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series 2019-02-09 13:29:51.000000000 +0300 @@ -0,0 +1,5 @@ +0001-Fix-compilation-on-i386.patch +0002-Fix-runtime-cpudetection.patch +0003-Enable-runtime-cpudetection-on-i386.patch +0004-Merge-back-unneeded-libgf_util.la.patch +0007-hack-to-run-testsuite-in-qemu.patch diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules --- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules 2018-05-22 16:43:40.000000000 +0300 +++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules 2019-02-09 13:29:51.000000000 +0300 @@ -6,23 +6,43 @@ GIT_TAG ?= $(shell echo '$(DEB_VERSION_UPSTREAM)' | sed -e 's/~/_/') -%: - dh $@ --with autoreconf +ifeq ($(DEB_HOST_ARCH), amd64) +QEMU_ARCH=x86_64 +# omitted: sse2 (always on amd64), sse3 (not used) +QEMU_CPUS=qemu64,-ssse3,-sse4.1,-sse4.2,-pclmulqdq +endif -ifeq 
($(DEB_HOST_ARCH_BITS), 32) -override_dh_auto_configure: - dh_auto_configure -- --disable-sse --disable-neon +ifeq ($(DEB_HOST_ARCH), i386) +QEMU_ARCH=i386 +# omitted: sse3 (not used) +QEMU_CPUS=qemu32,-sse2,-ssse3,-sse4.1,-sse4.2,-pclmulqdq endif -ifeq ($(DEB_HOST_ARCH), amd64) +ifeq ($(DEB_HOST_ARCH), armhf) +ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) +ifeq (neon,$(shell grep -o -w -h -m 1 neon /proc/cpuinfo 2>/dev/null)) + $(warning NEON detected on the build host, arm-without-neon configuration was not tested) +else +QEMU_ARCH=arm +QEMU_CPUS=cortex-a8 +endif +endif +endif + +%: + dh $@ --with autoreconf + +ifneq (,$(QEMU_ARCH)) ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) override_dh_auto_test: dh_auto_test - ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,-sse3,-ssse3,-sse4.1,-sse4.2 ./test/gf_unit 64 A -1 - - ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,-ssse3,-sse4.1,-sse4.2 ./test/gf_unit 64 A -1 - - ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,+ssse3,-sse4.1,-sse4.2 ./test/gf_unit 64 A -1 - - ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,+ssse3,+sse4.1,-sse4.2 ./test/gf_unit 64 A -1 - - ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,+ssse3,+sse4.1,+sse4.2 ./test/gf_unit 64 A -1 - + set -ex; C=$(QEMU_ARCH); c=$(QEMU_CPUS); p=X; \ + while test "$$p" != "$$c"; do \ + export QEMU_CMD="$(CURDIR)/libtool --mode=execute qemu-$$C-static -cpu $$c " ; \ + dh_auto_test; \ + p="$$c"; \ + c="`echo $$c|sed 's/,-/,+/'`"; \ + done endif endif