On 10.02.2019 10:33, Yuriy M. Kaminskiy wrote:
> On 09.02.2019 20:48, Yuriy M. Kaminskiy wrote:
>> 2) it can expose bugs on some cpus, that are not caught by testsuite
>> (e.g. [previous version of] my _mm_extract_epi64 replacement was very
>> buggy - but it was not detected by debian's qemu test [as it only runs
>> only single unit test for gf64, but only gf128 was affected]);
> 
> Just in case, I (ab)used valgrind hook to run complete testsuite for all
> combination of flags for i386 and amd64, incremental debdiff attached (on the 
> top of above).
> 
> Known problems: takes a bit too long to run (2.5 hours for amd64, and even 
> more for i386).

Unfortunately, after playing with it a bit, I found several problems:
1) runtime CPU detection on i386 was not enabled in the sources (so, on i386,
all SIMD code was compiled but never used); a new patch has been added;
2) qemu-user in stretch does not parse -cpu the way this test expects it to
(so my tests were flawed); I added a versioned Build-Depends on the qemu
version in buster;
3) with (1) & (2) fixed, I found that i386 sse4/pclmul was broken due to my
own mistake (the _mm_extract_epi64 emulation was wrong; this was caught by the
unit test); patch 0001* has been updated;
4) with (3) fixed, I discovered that my qemu-for-complete-testsuite patch was
flawed: gf_method should run on the same "cpu" as gf_unit (some tests are
activated only on a supporting cpu); see patch 0007* (not for upstream, too
"hacky") and the debian/rules update.

I rebuilt and retested the package on i386 and amd64; everything seems fine.

New cumulative debdiff attached.
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog   2018-05-22 
16:43:40.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/changelog   2019-02-09 
13:29:51.000000000 +0300
@@ -1,3 +1,13 @@
+gf-complete (1.0.2+2017.04.10.git.ea75cdf-3~bpo9+1~local2) stretch-backports; 
urgency=medium
+
+  * Rebuild for stretch-backports.
+  * Fix i386 simd compilation.
+  * Fix runtime cpudetection.
+  * Fix neon for armhf.
+  * Run complete test suite under qemu.
+
+ -- Yuriy M. Kaminskiy <yumkam+deb...@gmail.com>  Sat, 09 Feb 2019 13:29:51 
+0300
+
 gf-complete (1.0.2+2017.04.10.git.ea75cdf-3) unstable; urgency=medium
 
   * remove patch: 0001-temporarily-disable-sse3-and-above.patch (Closes: 
#899296)
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control     2018-05-22 
16:43:40.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/control     2019-02-09 
13:29:51.000000000 +0300
@@ -7,7 +7,7 @@
  Shengjing Zhu <i...@zhsj.me>,
 Build-Depends:
  debhelper (>= 10),
- qemu-user-static [amd64] <!nocheck>,
+ qemu-user-static (>= 1:3.1~) [i386 amd64 armhf] <!nocheck>,
 Standards-Version: 4.1.4
 Homepage: http://jerasure.org/
 Vcs-Git: https://salsa.debian.org/openstack-team/third-party/gf-complete.git
diff -Nru 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch
 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch
--- 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch
  1970-01-01 03:00:00.000000000 +0300
+++ 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0001-Fix-compilation-on-i386.patch
  2019-02-09 13:29:51.000000000 +0300
@@ -0,0 +1,27 @@
+From d42fe7d12cdc5f14e1bd9fd13f5d56e2689793dd Mon Sep 17 00:00:00 2001
+From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
+Date: Sat, 9 Feb 2019 12:55:11 +0300
+Subject: [PATCH 1/6] Fix compilation on i386
+
+---
+ include/gf_complete.h | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/include/gf_complete.h b/include/gf_complete.h
+index c4783e8..9436eb8 100644
+--- a/include/gf_complete.h
++++ b/include/gf_complete.h
+@@ -19,6 +19,10 @@
+   #ifdef __SSE4_1__
+     #include <smmintrin.h>
+   #endif
++  #ifdef __i386
++    #define _mm_insert_epi64(A,B,C) 
_mm_insert_epi32(_mm_insert_epi32((A),(uint32_t)(B),(C)*2),(uint32_t)((uint64_t)(B)>>32),(C)*2+1)
++    #define _mm_extract_epi64(A,C) 
((((uint64_t)_mm_extract_epi32((A),(C)*2+1))<<32)|(uint32_t)_mm_extract_epi32((A),(C)*2))
++  #endif
+ #endif
+ 
+ #ifdef INTEL_SSSE3
+-- 
+2.11.0
+
diff -Nru 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch
 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch
--- 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch
 1970-01-01 03:00:00.000000000 +0300
+++ 
gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0002-Fix-runtime-cpudetection.patch
 2019-02-09 13:29:51.000000000 +0300
@@ -0,0 +1,967 @@
+From b811ddd5cc50b7f3539d80419549f0712b5438a7 Mon Sep 17 00:00:00 2001
+From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
+Date: Sat, 9 Feb 2019 13:28:43 +0300
+Subject: [PATCH 2/6] Fix runtime cpudetection
+
+---
+ include/gf_complete.h |  3 +--
+ m4/ax_ext.m4          | 19 ++++++++--------
+ src/Makefile.am       |  5 ++++-
+ src/gf.c              | 60 +++++++++++++++++++++++++++++++++++++++------------
+ src/gf_w128.c         | 21 ++++++++++++++++++
+ src/gf_w16.c          | 36 +++++++++++++++++++++++++++++++
+ src/gf_w32.c          | 45 ++++++++++++++++++++++++++++++++++++++
+ src/gf_w4.c           | 51 +++++++++++++++++++++++++++++++++++++++++++
+ src/gf_w64.c          | 33 ++++++++++++++++++++++++++++
+ src/gf_w8.c           | 33 ++++++++++++++++++++++++++++
+ 10 files changed, 280 insertions(+), 26 deletions(-)
+
+diff --git a/include/gf_complete.h b/include/gf_complete.h
+index 9436eb8..0bdf9fc 100644
+--- a/include/gf_complete.h
++++ b/include/gf_complete.h
+@@ -15,8 +15,7 @@
+ #ifdef INTEL_SSE4
+   #ifdef __SSE4_2__
+     #include <nmmintrin.h>
+-  #endif
+-  #ifdef __SSE4_1__
++  #else /* def __SSE4_1__ */
+     #include <smmintrin.h>
+   #endif
+   #ifdef __i386
+diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4
+index 95c4dbe..e18b678 100644
+--- a/m4/ax_ext.m4
++++ b/m4/ax_ext.m4
+@@ -14,14 +14,14 @@ AC_DEFUN([AX_EXT],
+ 
+       AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], 
[ax_cv_have_neon_ext=yes])
+       if test "$ax_cv_have_neon_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS 
-march=armv8-a+simd -DARM_NEON"], [ax_cv_have_neon_ext=no])
++        AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS 
-DARM_NEON" NEON_FLAGS="-march=armv8-a+simd"], [ax_cv_have_neon_ext=no])
+       fi
+       ;;
+ 
+     arm*)
+       AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], 
[ax_cv_have_neon_ext=yes])
+       if test "$ax_cv_have_neon_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon 
-DARM_NEON"], [ax_cv_have_neon_ext=no])
++        AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS 
-DARM_NEON" NEON_FLAGS="-mfpu=neon"], [ax_cv_have_neon_ext=no])
+       fi
+       ;;
+ 
+@@ -36,40 +36,41 @@ AC_DEFUN([AX_EXT],
+ 
+       AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], 
[ax_cv_have_sse_ext=yes])
+       if test "$ax_cv_have_sse_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse 
-DINTEL_SSE"], [ax_cv_have_sse_ext=no])
++        AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE"], 
[ax_cv_have_sse_ext=no])
+       fi
+ 
+       AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], 
[ax_cv_have_sse2_ext=yes])
+       if test "$ax_cv_have_sse2_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 
-DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
++        AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS 
-DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
+       fi
+ 
+       AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], 
[ax_cv_have_sse3_ext=yes])
+       if test "$ax_cv_have_sse3_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 
-DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
++        AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS 
-DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
+       fi
+ 
+       AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], 
[ax_cv_have_ssse3_ext=yes])
+       if test "$ax_cv_have_ssse3_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 
-DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
++        AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS 
-DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
+       fi
+ 
+       AC_CACHE_CHECK([whether pclmuldq is enabled], 
[ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes])
+       if test "$ax_cv_have_pclmuldq_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul 
-DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
++        AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS 
-DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
+       fi
+ 
+       AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], 
[ax_cv_have_sse41_ext=yes])
+       if test "$ax_cv_have_sse41_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 
-DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
++        AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS 
-DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
+       fi
+ 
+       AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], 
[ax_cv_have_sse42_ext=yes])
+       if test "$ax_cv_have_sse42_ext" = yes; then
+-        AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 
-DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
++        AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS 
-DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
+       fi
+       ;;
+   esac
+ 
+   AC_SUBST(SIMD_FLAGS)
++  AC_SUBST(NEON_FLAGS)
+ ])
+diff --git a/src/Makefile.am b/src/Makefile.am
+index cfc2a50..4949497 100644
+--- a/src/Makefile.am
++++ b/src/Makefile.am
+@@ -21,7 +21,10 @@ libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC 
-Wsign-compare
+ libgf_complete_la_LIBADD = libgf_util.la
+ 
+ if HAVE_NEON
+-libgf_complete_la_SOURCES += neon/gf_w4_neon.c  \
++noinst_LTLIBRARIES += libgf_neon.la
++libgf_complete_la_LIBADD += libgf_neon.la
++libgf_neon_la_CFLAGS = $(libgf_complete_la_CFLAGS) $(NEON_FLAGS)
++libgf_neon_la_SOURCES = neon/gf_w4_neon.c  \
+                              neon/gf_w8_neon.c  \
+                              neon/gf_w16_neon.c \
+                              neon/gf_w32_neon.c \
+diff --git a/src/gf.c b/src/gf.c
+index 84d6996..3db6acc 100644
+--- a/src/gf.c
++++ b/src/gf.c
+@@ -910,23 +910,16 @@ void gf_multby_zero(void *dest, int bytes, int xor)
+ 
+ static void gf_unaligned_xor(void *src, void *dest, int bytes);
+ 
+-void gf_multby_one(void *src, void *dest, int bytes, int xor) 
++#ifdef   INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
++static void gf_multby_one_sse2(void *src, void *dest, int bytes, int xor) 
+ {
+   unsigned long uls, uld;
+   uint8_t *s8, *d8;
+   uint64_t *s64, *d64, *dtop64;
+   gf_region_data rd;
+-
+-  if (!xor) {
+-    if (dest != src)
+-      memcpy(dest, src, bytes);
+-    return;
+-  }
+-  uls = (unsigned long) src;
+-  uld = (unsigned long) dest;
+-
+-#ifdef   INTEL_SSE2
+-  if (gf_cpu_supports_intel_sse2) {
+     __m128i ms, md;
+     int abytes;
+     s8 = (uint8_t *) src;
+@@ -970,10 +963,23 @@ void gf_multby_one(void *src, void *dest, int bytes, int 
xor)
+       s8++;
+     }
+     return;
+-  }
++}
+ #endif
++
+ #if defined(ARM_NEON)
+-  if (gf_cpu_supports_arm_neon) {
++#ifndef __ARM_NEON
++#ifdef ARCH_AARCH64
++__attribute__((target("+simd")))
++#else
++__attribute__((target("fpu=neon")))
++#endif
++#endif
++static void gf_multby_one_neon(void *src, void *dest, int bytes, int xor) 
++{
++  unsigned long uls, uld;
++  uint8_t *s8, *d8;
++  uint64_t *s64, *d64, *dtop64;
++  gf_region_data rd;
+     s8 = (uint8_t *) src;
+     d8 = (uint8_t *) dest;
+ 
+@@ -1008,6 +1014,32 @@ void gf_multby_one(void *src, void *dest, int bytes, 
int xor)
+       d8++;
+     }
+     return;
++}
++#endif
++
++void gf_multby_one(void *src, void *dest, int bytes, int xor) 
++{
++  unsigned long uls, uld;
++  uint8_t *s8, *d8;
++  uint64_t *s64, *d64, *dtop64;
++  gf_region_data rd;
++
++  if (!xor) {
++    if (dest != src)
++      memcpy(dest, src, bytes);
++    return;
++  }
++  uls = (unsigned long) src;
++  uld = (unsigned long) dest;
++
++#ifdef   INTEL_SSE2
++  if (gf_cpu_supports_intel_sse2) {
++    return gf_multby_one_sse2(src, dest, bytes, xor);
++  }
++#endif
++#if defined(ARM_NEON)
++  if (gf_cpu_supports_arm_neon) {
++    return gf_multby_one_neon(src, dest, bytes, xor);
+   }
+ #endif
+   if (uls % 8 != uld % 8) {
+diff --git a/src/gf_w128.c b/src/gf_w128.c
+index 3bc2d65..ad87f21 100644
+--- a/src/gf_w128.c
++++ b/src/gf_w128.c
+@@ -83,6 +83,9 @@ int xor)
+ }
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#if !defined(__PCLMUL__)
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, 
gf_val_128_t val, int bytes,
+@@ -293,6 +296,9 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, 
gf_val_128_t b128, gf_val_12
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ void
+ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, 
gf_val_128_t c128)
+ {
+@@ -377,6 +383,9 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, 
gf_val_128_t b128, gf_val_
+ }
+ 
+ #if defined(INTEL_SSE4)
++#ifndef __SSE4_1__
++__attribute__((target("sse4.1")))
++#endif
+ void
+ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, 
gf_val_128_t c128)
+ {
+@@ -434,6 +443,9 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, 
gf_val_128_t b128, gf_
+ 
+ /* Ben: This slow function implements sse instrutions for bytwo_b because why 
not */
+ #if defined(INTEL_SSE4)
++#ifndef __SSE4_1__
++__attribute__((target("sse4.1")))
++#endif
+ void
+ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, 
gf_val_128_t c128)
+ {
+@@ -595,6 +607,9 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_128_
+ }
+ 
+ #if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
++#if !defined(__SSSE3__) || !defined(__SSE4_1__)
++__attribute__((target("ssse3,sse4.1")))
++#endif
+ static
+ void
+ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_128_t val, int bytes, int xor)
+@@ -693,6 +708,9 @@ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void 
*src, void *dest, gf_val_
+ #endif
+ 
+ #if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
++#if !defined(__SSSE3__) || !defined(__SSE4_1__)
++__attribute__((target("ssse3,sse4.1")))
++#endif
+ static
+ void
+ gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void 
*dest, gf_val_128_t val, int bytes, int xor)
+@@ -1491,6 +1509,9 @@ void gf_w128_group_r_init(gf_t *gf)
+ }
+ 
+ #if 0 // defined(INTEL_SSE4)
++#ifndef __SSE4_1__
++__attribute__((target("sse4.1")))
++#endif
+   static
+ void gf_w128_group_r_sse_init(gf_t *gf)
+ {
+diff --git a/src/gf_w16.c b/src/gf_w16.c
+index 8316892..c683c4e 100644
+--- a/src/gf_w16.c
++++ b/src/gf_w16.c
+@@ -80,6 +80,9 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void 
*dest, gf_val_32_t
+ }
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -143,6 +146,9 @@ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void 
*src, void *dest, gf_val
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -211,6 +217,9 @@ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void 
*src, void *dest, gf_val
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -393,6 +402,9 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
+  */
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -438,6 +450,9 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, 
gf_val_32_t b16)
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -476,6 +491,9 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, 
gf_val_32_t b16)
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -945,6 +963,9 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_32_t v
+ }
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+ void
+ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -1078,6 +1099,9 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, 
void *src, void *dest, gf_v
+ #endif
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+ void
+ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void 
*dest, gf_val_32_t val, int bytes, int xor)
+@@ -1503,6 +1527,9 @@ gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void 
*src, void *dest, gf_val_32_
+       v = _mm_srli_epi64(v, 1); }
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void 
+ gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -1566,6 +1593,9 @@ gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_32_t
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct 
gf_w16_bytwo_data *btd)
+@@ -1591,6 +1621,9 @@ gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, 
struct gf_w16_bytwo_data *
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data 
*btd)
+@@ -1619,6 +1652,9 @@ gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, 
struct gf_w16_bytwo_data *bt
+ 
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void 
+ gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+diff --git a/src/gf_w32.c b/src/gf_w32.c
+index 976b68b..a20e806 100644
+--- a/src/gf_w32.c
++++ b/src/gf_w32.c
+@@ -71,6 +71,9 @@ xor)
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static 
+ void
+ gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, 
uint32_t val, int bytes, int xor)
+@@ -121,6 +124,9 @@ gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void 
*src, void *dest, uint32
+ 
+ #if defined(INTEL_SSE4_PCLMUL) 
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static 
+ void
+ gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, 
uint32_t val, int bytes, int xor)
+@@ -175,6 +181,9 @@ gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void 
*src, void *dest, uint32
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static 
+ void
+ gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, 
uint32_t val, int bytes, int xor)
+@@ -350,6 +359,9 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -385,6 +397,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, 
gf_val_32_t b32)
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static 
+ void
+ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, 
uint32_t val, int bytes, int xor)
+@@ -438,6 +453,9 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void 
*src, void *dest, uint32
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -483,6 +501,9 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, 
gf_val_32_t b32)
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -522,6 +543,9 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, 
gf_val_32_t b32)
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -1057,6 +1081,9 @@ gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void 
*src, void *dest, gf_val_32_
+       v = _mm_srli_epi64(v, 1); }
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -1252,6 +1279,9 @@ gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void 
*src, void *dest, gf_val_32_
+ }
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct 
gf_w32_bytwo_data *btd)
+@@ -1277,6 +1307,9 @@ gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, 
struct gf_w32_bytwo_data *
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data 
*btd)
+@@ -1305,6 +1338,9 @@ gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, 
struct gf_w32_bytwo_data *bt
+ 
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void 
+ gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -1627,6 +1663,9 @@ gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void 
*src, void *dest, uint32_t
+ }
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+ void
+ gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, 
uint32_t val, int bytes, int xor)
+@@ -1763,6 +1802,9 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void 
*src, void *dest, uint32_t
+ }
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+ void
+ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void 
*dest, uint32_t val, int bytes, int xor)
+@@ -1954,6 +1996,9 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t 
*gf, void *src, void *des
+ 
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+ void
+ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, 
uint32_t val, int bytes, int xor)
+diff --git a/src/gf_w4.c b/src/gf_w4.c
+index 3a7b953..fedba30 100644
+--- a/src/gf_w4.c
++++ b/src/gf_w4.c
+@@ -136,6 +136,9 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t 
b)
+ /* Ben: This function works, but it is 33% slower than the normal shift mult 
*/
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -374,6 +377,9 @@ gf_w4_single_table_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_32_t
+ #define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); 
_mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", 
blah[i]); printf("\n"); }
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+ void 
+ gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -877,6 +883,9 @@ gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_32_t
+       v = _mm_srli_epi64(v, 1); }
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -928,6 +937,9 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_32_t v
+ 
+ /*
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void 
+ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+@@ -993,6 +1005,9 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, 
void *dest, gf_val_32_t v
+ */
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data 
*btd)
+@@ -1017,6 +1032,9 @@ gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+@@ -1043,6 +1061,9 @@ gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data 
*btd)
+@@ -1068,6 +1089,9 @@ gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+@@ -1096,6 +1120,9 @@ gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ 
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data 
*btd)
+@@ -1122,6 +1149,9 @@ gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+@@ -1148,6 +1178,9 @@ gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data 
*btd)
+@@ -1175,6 +1208,9 @@ gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+@@ -1202,6 +1238,9 @@ gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data 
*btd)
+@@ -1230,6 +1269,9 @@ gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+@@ -1258,6 +1300,9 @@ gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data 
*btd)
+@@ -1285,6 +1330,9 @@ gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static 
+ void
+ gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+@@ -1312,6 +1360,9 @@ gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, 
struct gf_bytwo_data *btd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void 
+ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_32_t val, int bytes, int xor)
+diff --git a/src/gf_w64.c b/src/gf_w64.c
+index 69e55db..ff78ad1 100644
+--- a/src/gf_w64.c
++++ b/src/gf_w64.c
+@@ -58,6 +58,9 @@ xor)
+ }
+ 
+ #if defined(INTEL_SSE4_PCLMUL) 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, 
gf_val_64_t val, int bytes, int
+@@ -145,6 +148,9 @@ xor)
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, 
gf_val_64_t val, int bytes, int
+@@ -341,6 +347,9 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, 
gf_val_64_t b64)
+ 
+ #if defined(INTEL_SSE4_PCLMUL) 
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_64_t
+@@ -383,6 +392,9 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, 
gf_val_64_t b64)
+  
+ #if defined(INTEL_SSE4_PCLMUL) 
+ 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_64_t
+@@ -425,6 +437,9 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, 
gf_val_64_t b64)
+ 
+ 
+ #if defined(INTEL_SSE4_PCLMUL) 
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+   void
+ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int 
bytes, int xor)
+ {
+@@ -1265,6 +1280,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void 
*src, void *dest, gf_val_64_
+ 
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_64_t val, int bytes, int xor)
+ {
+   int i;
+@@ -1329,6 +1347,9 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void 
*src, void *dest, gf_val_
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
+@@ -1362,6 +1383,9 @@ gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
+@@ -1393,6 +1417,9 @@ gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+ void
+ gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, 
gf_val_64_t val, int bytes, int xor)
+@@ -1671,6 +1698,9 @@ int gf_w64_composite_init(gf_t *gf)
+ }
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+   void
+ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void 
*dest, uint64_t val, int bytes, int xor)
+@@ -1755,6 +1785,9 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t 
*gf, void *src, void *des
+ #endif
+ 
+ #ifdef INTEL_SSE4
++#ifndef __SSE4_1__
++__attribute__((target("sse4.1")))
++#endif
+ static
+   void
+ gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, 
uint64_t val, int bytes, int xor)
+diff --git a/src/gf_w8.c b/src/gf_w8.c
+index f647a31..d226925 100644
+--- a/src/gf_w8.c
++++ b/src/gf_w8.c
+@@ -129,6 +129,9 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
+ 
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -174,6 +177,9 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, 
gf_val_32_t b8)
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -212,6 +218,9 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, 
gf_val_32_t b8)
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ inline
+ gf_val_32_t
+@@ -286,6 +295,9 @@ gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t v
+ }
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+@@ -344,6 +356,9 @@ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+@@ -406,6 +421,9 @@ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_
+ #endif
+ 
+ #if defined(INTEL_SSE4_PCLMUL)
++#ifndef __PCLMUL__
++__attribute__((target("pclmul,sse4.1")))
++#endif
+ static
+ void
+ gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+@@ -981,6 +999,9 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in
+ }
+ 
+ #ifdef INTEL_SSSE3
++#ifndef __SSSE3__
++__attribute__((target("ssse3")))
++#endif
+ static
+   void
+ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+@@ -1606,6 +1627,9 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t
+   v = _mm_srli_epi64(v, 1); }
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+   void 
+ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+@@ -1661,6 +1685,9 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+   void
+ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+@@ -1686,6 +1713,9 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt
+ #endif
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+   void
+ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+@@ -1714,6 +1744,9 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+ 
+ 
+ #ifdef INTEL_SSE2
++#ifndef __SSE2__
++__attribute__((target("sse2")))
++#endif
+ static
+   void 
+ gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+-- 
+2.11.0
+
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch      1970-01-01 03:00:00.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0003-Enable-runtime-cpudetection-on-i386.patch      2019-02-09 13:29:51.000000000 +0300
@@ -0,0 +1,25 @@
+From 03ea8244d34a2800747bac0f0a4355a017f31a7b Mon Sep 17 00:00:00 2001
+From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
+Date: Sun, 10 Feb 2019 11:54:23 +0300
+Subject: [PATCH 3/6] Enable runtime cpudetection on i386
+
+---
+ src/gf_cpu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/gf_cpu.c b/src/gf_cpu.c
+index f65131f..e113051 100644
+--- a/src/gf_cpu.c
++++ b/src/gf_cpu.c
+@@ -20,7 +20,7 @@ int gf_cpu_supports_intel_sse3 = 0;
+ int gf_cpu_supports_intel_sse2 = 0;
+ int gf_cpu_supports_arm_neon = 0;
+ 
+-#if defined(__x86_64__)
++#if defined(__x86_64__) || defined(__i386)
+ 
+ /* CPUID Feature Bits */
+ 
+-- 
+2.11.0
+
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch        1970-01-01 03:00:00.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0004-Merge-back-unneeded-libgf_util.la.patch        2019-02-09 13:29:51.000000000 +0300
@@ -0,0 +1,42 @@
+From 44b18e4748b1d5c422d4517648c9bb1267c07bfc Mon Sep 17 00:00:00 2001
+From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
+Date: Sat, 9 Feb 2019 18:56:09 +0300
+Subject: [PATCH 4/6] Merge back unneeded libgf_util.la
+
+As SIMD_FLAGS contains only preprocessor macros, splitting off
+gf_method.c compilation is no longer needed for runtime cpu detection.
+---
+ src/Makefile.am | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+diff --git a/src/Makefile.am b/src/Makefile.am
+index 4949497..5e7dafc 100644
+--- a/src/Makefile.am
++++ b/src/Makefile.am
+@@ -5,20 +5,12 @@ AUTOMAKE_OPTIONS = subdir-objects
+ 
+ AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+ 
+-# avoid using SIMD_FLAGS for code that calls strcmp as new gcc
+-# versions will use SIMD for the strcmp implementation. Instead
+-# we create a static library just for gf_method that is not compiled
+-# with SIMD_FLAGS, this static library will get linked into gf_complete.so
+-noinst_LTLIBRARIES = libgf_util.la
+-libgf_util_la_SOURCES = gf_method.c
+-libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
+-
+-# we narrowly use SIMD_FLAGS for code that needs it
++noinst_LTLIBRARIES =
+ lib_LTLIBRARIES = libgf_complete.la
+ libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
+-          gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
++          gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c gf_method.c
+ libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+-libgf_complete_la_LIBADD = libgf_util.la
++libgf_complete_la_LIBADD =
+ 
+ if HAVE_NEON
+ noinst_LTLIBRARIES += libgf_neon.la
+-- 
+2.11.0
+
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch    1970-01-01 03:00:00.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/0007-hack-to-run-testsuite-in-qemu.patch    2019-02-09 13:29:51.000000000 +0300
@@ -0,0 +1,26 @@
+From d33f9e3a0f47310145c6b8600a2df66bbd045b01 Mon Sep 17 00:00:00 2001
+From: "Yuriy M. Kaminskiy" <yum...@gmail.com>
+Date: Sun, 10 Feb 2019 20:06:20 +0300
+Subject: [PATCH 7/7] hack to run testsuite in qemu
+
+Not for upstream.
+---
+ tools/Makefile.am | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/Makefile.am b/tools/Makefile.am
+index 4ca9131..ed2717d 100644
+--- a/tools/Makefile.am
++++ b/tools/Makefile.am
+@@ -40,7 +40,7 @@ endif
+ 
+ # gf_unit tests as generated by gf_methods
+ gf_unit_w%.sh: gf_methods
+-      ./$^ $(@:gf_unit_w%.sh=%) -A -U ${VALGRIND} > $@ || rm $@
+      ${QEMU_CMD}./$^ $(@:gf_unit_w%.sh=%) -A -U ${VALGRIND} | perl -p -e 's|^\.\.|${QEMU_CMD}$$&|' > $@ || rm $@
+ 
+ TESTS = gf_unit_w128.sh \
+         gf_unit_w64.sh  \
+-- 
+2.11.0
+
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series      1970-01-01 03:00:00.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/patches/series      2019-02-09 13:29:51.000000000 +0300
@@ -0,0 +1,5 @@
+0001-Fix-compilation-on-i386.patch
+0002-Fix-runtime-cpudetection.patch
+0003-Enable-runtime-cpudetection-on-i386.patch
+0004-Merge-back-unneeded-libgf_util.la.patch
+0007-hack-to-run-testsuite-in-qemu.patch
diff -Nru gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules
--- gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules       2018-05-22 16:43:40.000000000 +0300
+++ gf-complete-1.0.2+2017.04.10.git.ea75cdf/debian/rules       2019-02-09 13:29:51.000000000 +0300
@@ -6,23 +6,43 @@
 
 GIT_TAG         ?= $(shell echo '$(DEB_VERSION_UPSTREAM)' | sed -e 's/~/_/')
 
-%:
-       dh $@  --with autoreconf
+ifeq ($(DEB_HOST_ARCH), amd64)
+QEMU_ARCH=x86_64
+# omitted: sse2 (always on amd64), sse3 (not used)
+QEMU_CPUS=qemu64,-ssse3,-sse4.1,-sse4.2,-pclmulqdq
+endif
 
-ifeq ($(DEB_HOST_ARCH_BITS), 32)
-override_dh_auto_configure:
-       dh_auto_configure -- --disable-sse --disable-neon
+ifeq ($(DEB_HOST_ARCH), i386)
+QEMU_ARCH=i386
+# omitted: sse3 (not used)
+QEMU_CPUS=qemu32,-sse2,-ssse3,-sse4.1,-sse4.2,-pclmulqdq
 endif
 
-ifeq ($(DEB_HOST_ARCH), amd64)
+ifeq ($(DEB_HOST_ARCH), armhf)
+ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
+ifeq (neon,$(shell grep -o -w -h -m 1 neon /proc/cpuinfo 2>/dev/null))
+       $(warning NEON detected on the build host, arm-without-neon configuration was not tested)
+else
+QEMU_ARCH=arm
+QEMU_CPUS=cortex-a8
+endif
+endif
+endif
+
+%:
+       dh $@  --with autoreconf
+
+ifneq (,$(QEMU_ARCH))
 ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
 override_dh_auto_test:
        dh_auto_test
-       ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,-sse3,-ssse3,-sse4.1,-sse4.2 ./test/gf_unit 64 A -1 -
-       ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,-ssse3,-sse4.1,-sse4.2 ./test/gf_unit 64 A -1 -
-       ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,+ssse3,-sse4.1,-sse4.2 ./test/gf_unit 64 A -1 -
-       ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,+ssse3,+sse4.1,-sse4.2 ./test/gf_unit 64 A -1 -
-       ./libtool --mode=execute qemu-x86_64-static -cpu qemu64,+sse3,+ssse3,+sse4.1,+sse4.2 ./test/gf_unit 64 A -1 -
+       set -ex; C=$(QEMU_ARCH); c=$(QEMU_CPUS); p=X; \
+       while test "$$p" != "$$c"; do \
+               export QEMU_CMD="$(CURDIR)/libtool --mode=execute qemu-$$C-static -cpu $$c " ; \
+               dh_auto_test; \
+               p="$$c"; \
+               c="`echo $$c|sed 's/,-/,+/'`"; \
+       done
 endif
 endif
 

Reply via email to