Hi,
Please find attached the patch that adds the first set of x86 intrinsic
headers to the AArch64 target.
The implementation is based on similar work targeted at PPC64LE.
https://gcc.gnu.org/ml/gcc-patches/2017-05/msg00550.html
Following the PowerPC approach, the corresponding DejaGnu tests are copied
from gcc/testsuite/gcc.target/i386/ to gcc/testsuite/gcc.target/aarch64, as
the source remains the same; the only modifications are target-related, as
appropriate.
Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if any comments or suggestions.
Thanks,
Naveen
2017-05-29 Naveen H.S <naveen.hurugalaw...@cavium.com>
[gcc]
* config.gcc (aarch64*-*-*): Add bmi2intrin.h, bmiintrin.h,
and x86intrin.h.
* config/aarch64/bmi2intrin.h: New file.
* config/aarch64/bmiintrin.h: New file.
* config/aarch64/x86intrin.h: New file.
[gcc/testsuite]
* gcc.target/aarch64/bmi-andn-1.c: New file.
* gcc.target/aarch64/bmi-andn-2.c: New file.
* gcc.target/aarch64/bmi-bextr-1.c: New file.
* gcc.target/aarch64/bmi-bextr-2.c: New file.
* gcc.target/aarch64/bmi-bextr-4.c: New file.
* gcc.target/aarch64/bmi-bextr-5.c: New file.
* gcc.target/aarch64/bmi-blsi-1.c: New file.
* gcc.target/aarch64/bmi-blsi-2.c: New file.
* gcc.target/aarch64/bmi-blsmsk-1.c: New file.
* gcc.target/aarch64/bmi-blsmsk-2.c: New file.
* gcc.target/aarch64/bmi-blsr-1.c: New file.
* gcc.target/aarch64/bmi-blsr-2.c: New file.
* gcc.target/aarch64/bmi-check.h: New file.
* gcc.target/aarch64/bmi-tzcnt-1.c: New file.
* gcc.target/aarch64/bmi-tzcnt-2.c: New file.
* gcc.target/aarch64/bmi2-bzhi32-1.c: New file.
* gcc.target/aarch64/bmi2-bzhi64-1.c: New file.
* gcc.target/aarch64/bmi2-bzhi64-1a.c: New file.
* gcc.target/aarch64/bmi2-check.h: New file.
* gcc.target/aarch64/bmi2-mulx32-1.c: New file.
* gcc.target/aarch64/bmi2-mulx32-2.c: New file.
* gcc.target/aarch64/bmi2-mulx64-1.c: New file.
* gcc.target/aarch64/bmi2-mulx64-2.c: New file.
* gcc.target/aarch64/bmi2-pdep32-1.c: New file.
* gcc.target/aarch64/bmi2-pdep64-1.c: New file.
* gcc.target/aarch64/bmi2-pext32-1.c: New file.
* gcc.target/aarch64/bmi2-pext64-1.c: New file.
* gcc.target/aarch64/bmi2-pext64-1a.c: New file.
diff --git a/gcc/config.gcc b/gcc/config.gcc
index f55dcaa..9eac70e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -301,6 +301,7 @@ m32c*-*-*)
aarch64*-*-*)
cpu_type=aarch64
extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
+ extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
c_target_objs="aarch64-c.o"
cxx_target_objs="aarch64-c.o"
extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
diff --git a/gcc/config/aarch64/bmi2intrin.h b/gcc/config/aarch64/bmi2intrin.h
new file mode 100644
index 0000000..c797f22
--- /dev/null
+++ b/gcc/config/aarch64/bmi2intrin.h
@@ -0,0 +1,148 @@
+/* Copyright (C) 2011-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to AArch64.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  /* BZHI: zero the bits of __X at and above bit index __Y.  The x86
     instruction uses only the low 8 bits of the index and leaves the
     source unchanged when the index is >= 32.  The former
     (__X << (32 - __Y)) >> (32 - __Y) form shifts by the type width
     when __Y == 0, which is undefined behavior in C.  */
  unsigned int __index = __Y & 0xff;

  if (__index >= 32)
    return __X;
  return __X & ((1U << __index) - 1);
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  /* Full 32x32->64 unsigned multiply: high half is stored through
     *__P, low half is the return value.  */
  unsigned long long __prod;

  __prod = (unsigned long long) __X * (unsigned long long) __Y;
  *__P = (unsigned int) (__prod >> 32);
  return (unsigned int) (__prod & 0xffffffffULL);
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  /* 64-bit BZHI: zero the bits of __X at and above bit index __Y.
     The hardware uses only the low 8 bits of the index and returns
     the source unchanged when the index is >= 64.  The former
     shift-left/shift-right form shifts by 64 when __Y == 0, which
     is undefined behavior in C.  */
  unsigned long long __index = __Y & 0xff;

  if (__index >= 64)
    return __X;
  return __X & ((1ULL << __index) - 1);
}
+
/* __int128 requires base 64-bit.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  /* Full 64x64->128 unsigned multiply: high half through *__P,
     low half returned.  */
  unsigned __int128 __prod = (unsigned __int128) __X;

  __prod *= __Y;
  *__P = (unsigned long long) (__prod >> 64);
  return (unsigned long long) __prod;
}
+
+#ifndef __ILP32__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  /* PDEP: deposit the low-order bits of __X into the positions of
     the set bits of __M, lowest mask bit first.
     Fixed vs. the previous version: locals use the reserved __
     prefix (this is an installed header, so plain names like 'm'
     or 'mask' could be clobbered by user macros), and the 64-bit
     builtins/types are used instead of 'unsigned long' +
     __builtin_clzl/popcountl, which assume an LP64 ABI.  */
  unsigned long long __result = 0x0ULL;
  const unsigned long long __mask = 0x8000000000000000ULL;
  unsigned long long __m = __M;
  unsigned long long __c, __t;
  unsigned long long __p;

  /* The pop-count of the mask gives the number of the bits from
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountll (__M);

  /* The loop is for the number of '1' bits in the mask and clearing
     each mask bit as it is processed.  */
  while (__m != 0)
    {
      __c = __builtin_clzll (__m);
      __t = __X << (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t & (__mask >> __c));
      __p++;
    }
  return (__result);
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  /* PEXT: extract the bits of __X selected by the set bits of __M
     and pack them into the low-order bits of the result.
     Fixed vs. the previous version: removed the dead initializer
     'p = 0x4040404040404040UL' and its "bit permute control"
     comment (a leftover from the PowerPC port -- __p is
     unconditionally overwritten below); locals use the reserved __
     prefix; 64-bit builtins/types replace the LP64-only
     'unsigned long' + __builtin_clzl/popcountl.  */
  const unsigned long long __mask = 0x8000000000000000ULL;
  unsigned long long __m = __M;
  unsigned long long __c;
  unsigned long long __result;
  unsigned long long __p;

  __p = 64 - __builtin_popcountll (__M);
  __result = 0;
  /* We could a use a for loop here, but that combined with
     -funroll-loops can expand to a lot of code.  The while
     loop avoids unrolling and the compiler commons the xor
     from clearing the mask bit with the (m != 0) test.  The
     result is a more compact loop setup and body.  */
  while (__m != 0)
    {
      unsigned long long __t;
      __c = __builtin_clzll (__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t);
      __p++;
    }
  return (__result);
}
+
/* The 32-bit forms delegate to the 64-bit pdep/pext above; the
   upper 32 mask bits are zero, so the result fits in 32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return (unsigned int) _pdep_u64 (__X, __Y);
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  /* Narrowing is safe: only the low 32 mask bits can be set.  */
  return (unsigned int) _pext_u64 (__X, __Y);
}
+
+#endif /* __ILP32__ */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/aarch64/bmiintrin.h b/gcc/config/aarch64/bmiintrin.h
new file mode 100644
index 0000000..b418a3f
--- /dev/null
+++ b/gcc/config/aarch64/bmiintrin.h
@@ -0,0 +1,208 @@
+/* Copyright (C) 2010-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to AArch64.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMIINTRIN_H_INCLUDED
+#define _BMIINTRIN_H_INCLUDED
+
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u16 (unsigned short __X)
{
  /* x86 TZCNT returns the operand width (16) for a zero source,
     whereas __builtin_ctz (0) is undefined.  Guard the zero case
     explicitly.  */
  return __X != 0 ? __builtin_ctz (__X) : 16;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u32 (unsigned int __X, unsigned int __Y)
{
  /* ANDN: __Y with every bit that is set in __X cleared.  */
  unsigned int __notx = ~__X;
  return __notx & __Y;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u32 (unsigned int __X, unsigned int __P, unsigned int __L)
{
  /* BEXTR: extract __L bits of __X starting at bit __P.  The former
     (__X << (32 - (__L + __P))) >> (32 - __L) form is undefined
     when __L == 0 or __L + __P > 32 (shift count >= width); x86
     BEXTR defines those cases (zero-length -> 0, out-of-range bits
     read as zero), so handle them explicitly.  */
  if (__L == 0 || __P > 31)
    return 0;
  __X >>= __P;
  if (__L < 32)
    __X &= (1U << __L) - 1;
  return __X;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u32 (unsigned int __X, unsigned int __Y)
{
  /* Control word __Y packs the start index in bits 0-7 and the
     field length in bits 8-15, as on x86 BEXTR.  */
  unsigned int __start = __Y & 0xFF;
  unsigned int __len = (__Y >> 8) & 0xFF;
  return (_bextr_u32 (__X, __start, __len));
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u32 (unsigned int __X)
{
  /* BLSI: isolate the lowest set bit of __X (zero stays zero).  */
  unsigned int __lowbit = ~__X + 1;	/* two's complement == -__X  */
  return __X & __lowbit;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u32 (unsigned int __X)
{
  /* Alias spelling of __blsi_u32: lowest-set-bit isolation,
     computed directly.  */
  return (__X & -__X);
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u32 (unsigned int __X)
{
  /* BLSMSK: mask of all bits up to and including the lowest set
     bit of __X.  */
  unsigned int __below = __X - 1;
  return __X ^ __below;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u32 (unsigned int __X)
{
  /* Alias spelling of __blsmsk_u32, computed directly.  */
  return (__X ^ (__X - 1));
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u32 (unsigned int __X)
{
  /* BLSR: clear (reset) the lowest set bit of __X.  */
  unsigned int __below = __X - 1;
  return __X & __below;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u32 (unsigned int __X)
{
  /* Alias spelling of __blsr_u32, computed directly.  */
  return (__X & (__X - 1));
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u32 (unsigned int __X)
{
  /* x86 TZCNT returns the operand width (32) for a zero source,
     whereas __builtin_ctz (0) is undefined.  Guard the zero case
     explicitly.  */
  return __X != 0 ? __builtin_ctz (__X) : 32;
}
+
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u32 (unsigned int __X)
{
  /* Same fix as __tzcnt_u32: TZCNT of zero is defined to be 32 on
     x86, but __builtin_ctz (0) is undefined.  */
  return __X != 0 ? __builtin_ctz (__X) : 32;
}
+
/* 64-bit variants of the BMI bit-manipulation operations.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
  /* ANDN: __Y with every bit that is set in __X cleared.  */
  unsigned long long __notx = ~__X;
  return __notx & __Y;
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u64 (unsigned long long __X, unsigned int __P, unsigned int __L)
{
  /* 64-bit BEXTR: extract __L bits of __X starting at bit __P.  The
     former shift-left/shift-right form is undefined when __L == 0
     or __L + __P > 64 (shift count >= width); x86 BEXTR defines
     those cases, so handle them explicitly.  */
  if (__L == 0 || __P > 63)
    return 0;
  __X >>= __P;
  if (__L < 64)
    __X &= (1ULL << __L) - 1;
  return __X;
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u64 (unsigned long long __X, unsigned long long __Y)
{
  /* Control word __Y packs the start index in bits 0-7 and the
     field length in bits 8-15, as on x86 BEXTR.  */
  unsigned int __start = __Y & 0xFF;
  unsigned int __len = (__Y & 0xFF00) >> 8;
  return (_bextr_u64 (__X, __start, __len));
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u64 (unsigned long long __X)
{
  /* BLSI: isolate the lowest set bit of __X (zero stays zero).  */
  unsigned long long __lowbit = ~__X + 1;	/* two's complement  */
  return __X & __lowbit;
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u64 (unsigned long long __X)
{
  /* Alias spelling of __blsi_u64, computed directly.  */
  return (__X & -__X);
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u64 (unsigned long long __X)
{
  /* BLSMSK: mask of all bits up to and including the lowest set
     bit of __X.  */
  unsigned long long __below = __X - 1;
  return __X ^ __below;
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u64 (unsigned long long __X)
{
  /* Alias spelling of __blsmsk_u64, computed directly.  */
  return (__X ^ (__X - 1));
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u64 (unsigned long long __X)
{
  /* BLSR: clear (reset) the lowest set bit of __X.  */
  unsigned long long __below = __X - 1;
  return __X & __below;
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u64 (unsigned long long __X)
{
  /* Alias spelling of __blsr_u64, computed directly.  */
  return (__X & (__X - 1));
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u64 (unsigned long long __X)
{
  /* x86 TZCNT returns the operand width (64) for a zero source,
     whereas __builtin_ctzll (0) is undefined.  Guard the zero case
     explicitly.  */
  return __X != 0 ? __builtin_ctzll (__X) : 64;
}
+
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u64 (unsigned long long __X)
{
  /* Same fix as __tzcnt_u64: TZCNT of zero is defined to be 64 on
     x86, but __builtin_ctzll (0) is undefined.  */
  return __X != 0 ? __builtin_ctzll (__X) : 64;
}
+
+#endif /* _BMIINTRIN_H_INCLUDED */
diff --git a/gcc/config/aarch64/x86intrin.h b/gcc/config/aarch64/x86intrin.h
new file mode 100644
index 0000000..2044734
--- /dev/null
+++ b/gcc/config/aarch64/x86intrin.h
@@ -0,0 +1,43 @@
+/* Copyright (C) 2008-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to AArch64.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#endif
+
+#ifndef _X86INTRIN_H_INCLUDED
+#define _X86INTRIN_H_INCLUDED
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+
+#endif /* _X86INTRIN_H_INCLUDED */
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-andn-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-andn-1.c
new file mode 100644
index 0000000..2cd8331
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-andn-1.c
@@ -0,0 +1,32 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference ANDN: the 'dummy' addend (always passed as 0) hides the
+   ~a & b idiom from the compiler so it is computed the long way.  */
+long long calc_andn_u64 (long long src1,
+ long long src2,
+ long long dummy)
+{
+ return (~src1 + dummy) & (src2);
+}
+
+/* Compare __andn_u64 against the reference over a few inputs.  */
+static void
+bmi_test()
+{
+ unsigned i;
+
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_andn_u64 (src, src+i, 0);
+ res = __andn_u64 (src, src+i);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-andn-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-andn-2.c
new file mode 100644
index 0000000..5d58acb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-andn-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference ANDN (32-bit): 'dummy' (always 0) hides the ~a & b idiom
+   from the compiler.  */
+long long calc_andn_u32 (int src1, int src2, int dummy)
+{
+ return (~src1+dummy) & (src2);
+}
+
+/* Compare __andn_u32 against the reference over a few inputs.  */
+static void
+bmi_test()
+{
+ unsigned i;
+
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_andn_u32 (src, src+i, 0);
+ res = __andn_u32 (src, src+i);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-1.c
new file mode 100644
index 0000000..1ce15cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-1.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference BEXTR (64-bit): src2 packs start (bits 0-7) and length
+   (bits 8-15); extract bit by bit.  */
+long long calc_bextr_u64 (unsigned long long src1,
+ unsigned long long src2)
+{
+ long long res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 64) {
+ unsigned i;
+ unsigned last = (start+len) < 64 ? start+len : 64;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+/* Compare __bextr_u64 against the reference for pseudo-random
+   start/len pairs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned long long src1 = 0xfacec0ffeefacec0;
+ unsigned long long res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = (i * 1983) % 64;
+ len = (i + (i * 1983)) % 64;
+
+ src1 = src1 * 3;
+ src2 = start | (((unsigned long long)len) << 8);
+
+ res_ref = calc_bextr_u64 (src1, src2);
+ res = __bextr_u64 (src1, src2);
+
+ if (res != res_ref)
+ abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-2.c
new file mode 100644
index 0000000..cdaf133
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-2.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference BEXTR (32-bit): src2 packs start (bits 0-7) and length
+   (bits 8-15); extract bit by bit.  */
+unsigned calc_bextr_u32 (unsigned src1, unsigned src2)
+{
+ unsigned res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 32) {
+ unsigned i;
+ unsigned last = (start+len) < 32 ? start+len : 32;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+/* Compare __bextr_u32 against the reference for pseudo-random
+   start/len pairs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned src1 = 0xfacec0ff;
+ unsigned res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = (i * 1983) % 32;
+ len = (i + (i * 1983)) % 32;
+
+ src1 = src1 * 3;
+ src2 = start | (((unsigned)len) << 8);
+
+ res_ref = calc_bextr_u32 (src1, src2);
+ res = __bextr_u32 (src1, src2);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-4.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-4.c
new file mode 100644
index 0000000..2f2acbe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-4.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference BEXTR (32-bit), same as bmi-bextr-2.c; here the explicit
+   start/len form _bextr_u32 is under test.  */
+unsigned calc_bextr_u32 (unsigned src1, unsigned src2)
+{
+ unsigned res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 32) {
+ unsigned i;
+ unsigned last = (start+len) < 32 ? start+len : 32;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+/* Compare _bextr_u32 (explicit start/len operands) against the
+   reference.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned src1 = 0xfacec0ff;
+ unsigned res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = i * 4;
+ len = i * 4;
+
+ src1 = src1 * 3;
+ src2 = (start & 0xff) | ((len & 0xff) << 8);
+
+ res_ref = calc_bextr_u32 (src1, src2);
+ res = _bextr_u32 (src1, start, len);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-5.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-5.c
new file mode 100644
index 0000000..2cfa24f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-5.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference BEXTR (64-bit), same as bmi-bextr-1.c; here the explicit
+   start/len form _bextr_u64 is under test.  */
+long long calc_bextr_u64 (unsigned long long src1,
+ unsigned long long src2)
+{
+ long long res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 64) {
+ unsigned i;
+ unsigned last = (start+len) < 64 ? start+len : 64;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+/* Compare _bextr_u64 (explicit start/len operands) against the
+   reference.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned long long src1 = 0xfacec0ffeefacec0;
+ unsigned long long res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = i * 4;
+ len = i * 3;
+ src1 = src1 * 3;
+ src2 = (start & 0xff) | ((len & 0xff) << 8);
+
+ res_ref = calc_bextr_u64 (src1, src2);
+ res = _bextr_u64 (src1, start, len);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsi-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-1.c
new file mode 100644
index 0000000..8c69a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-1.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* To fool the compiler, so it does not generate blsi here. */
+long long calc_blsi_u64 (long long src1, long long src2)
+{
+ return (-src1) & (src2);
+}
+
+static void
+bmi_test()
+{
+ unsigned i;
+
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsi_u64 (src, src);
+ res = __blsi_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsi-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-2.c
new file mode 100644
index 0000000..8dcac7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* To fool the compiler, so it does not generate blsi here.  */
+/* Reference BLSI (32-bit): (-x) & x isolates the lowest set bit.  */
+int calc_blsi_u32 (int src1, int src2)
+{
+ return (-src1) & (src2);
+}
+
+/* Compare __blsi_u32 against the reference over a few inputs.  */
+static void
+bmi_test()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsi_u32 (src, src);
+ res = __blsi_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-1.c
new file mode 100644
index 0000000..e0856ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-1.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Trick compiler in order not to generate target insn here.  */
+/* Reference BLSMSK: (x-1) ^ x masks up to the lowest set bit.  */
+long long calc_blsmsk_u64 (long long src1, long long src2)
+{
+ return (src1-1) ^ (src2);
+}
+
+/* Compare __blsmsk_u64 against the reference over a few inputs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsmsk_u64 (src, src);
+ res = __blsmsk_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-2.c
new file mode 100644
index 0000000..67cdd08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Trick compiler in order not to generate target insn here.  */
+/* Reference BLSMSK (32-bit): (x-1) ^ x masks up to the lowest set
+   bit.  */
+int calc_blsmsk_u32 (int src1, int src2)
+{
+ return (src1-1) ^ (src2);
+}
+
+/* Compare __blsmsk_u32 against the reference over a few inputs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsmsk_u32 (src, src);
+ res = __blsmsk_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsr-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-1.c
new file mode 100644
index 0000000..174fac8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-1.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference BLSR: (x-1) & x clears the lowest set bit.  */
+long long calc_blsr_u64 (long long src1, long long src2)
+{
+ return (src1-1) & (src2);
+}
+
+/* Compare __blsr_u64 against the reference over a few inputs.  */
+static void
+bmi_test()
+{
+ unsigned i;
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsr_u64 (src, src);
+ res = __blsr_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsr-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-2.c
new file mode 100644
index 0000000..820657c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-2.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference BLSR (32-bit): (x-1) & x clears the lowest set bit.  */
+int calc_blsr_u32 (int src1, int src2)
+{
+ return (src1-1) & (src2);
+}
+
+/* Compare __blsr_u32 against the reference over a few inputs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsr_u32 (src, src);
+ res = __blsr_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-check.h b/gcc/testsuite/gcc.target/aarch64/bmi-check.h
new file mode 100644
index 0000000..2ddad62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-check.h
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Each BMI test file defines bmi_test, which calls abort () on a
+   mismatch; reaching the return in main means the test passed.  */
+static void bmi_test (void);
+
+static void __attribute__ ((noinline)) do_test (void)
+{
+ bmi_test ();
+}
+
+int
+main ()
+{
+ do_test ();
+#ifdef DEBUG
+ printf ("PASSED\n");
+#endif
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-1.c
new file mode 100644
index 0000000..267c4b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference TZCNT (64-bit): count low-order zero bits one at a
+   time, saturating at 64.  */
+long long calc_tzcnt_u64 (long long src)
+{
+ int i;
+ int res = 0;
+
+ while ( (res<64) && ((src&1) == 0)) {
+ ++res;
+ src >>= 1;
+ }
+
+ return res;
+}
+
+/* Compare __tzcnt_u64 against the reference over a few inputs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_tzcnt_u64 (src);
+ res = __tzcnt_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-2.c
new file mode 100644
index 0000000..2414c6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-2.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Reference TZCNT (32-bit): count low-order zero bits one at a
+   time, saturating at 32.  */
+int calc_tzcnt_u32 (int src)
+{
+ int i;
+ int res = 0;
+
+ while ( (res<32) && ((src&1) == 0)) {
+ ++res;
+ src >>= 1;
+ }
+ return res;
+}
+
+/* Compare __tzcnt_u32 against the reference over a few inputs.  */
+static void
+bmi_test ()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = i + (src << i);
+
+ res_ref = calc_tzcnt_u32 (src);
+ res = __tzcnt_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..35c56ce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference BZHI (32-bit): clear the 32-l high-order bits one at a
+   time.  */
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+ unsigned res = a;
+ int i;
+ for (i = 0; i < 32 - l; ++i)
+ res &= ~(1 << (31 - i));
+
+ return res;
+}
+
+/* Compare _bzhi_u32 against the reference over a few inputs.  */
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned src = 0xce7ace0f;
+ unsigned res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_bzhi_u32 (src, i * 2);
+ res = _bzhi_u32 (src, i * 2);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..0205aa2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference BZHI (64-bit): clear the 64-l high-order bits one at a
+   time.  */
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+ unsigned long long res = a;
+ int i;
+ for (i = 0; i < 64 - l; ++i)
+ res &= ~(1LL << (63 - i));
+
+ return res;
+}
+
+/* Compare _bzhi_u64 against the reference over a few inputs.  */
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned long long src = 0xce7ace0ce7ace0ff;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_bzhi_u64 (src, i * 2);
+ res = _bzhi_u64 (src, i * 2);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..ce3b8a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1a.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+
+unsigned long long
+test__bzhi_u64_group (unsigned long long a)
+{
+ /* bzhi is implemented in source as shift left then shift right
+ to clear the high order bits.
+ NOTE(review): the rest of this comment and the dg-final
+ scan-assembler patterns below were carried over from the
+ PowerPC version -- rldicl, sld and srd are PowerPC mnemonics,
+ not AArch64 ones, so scan-assembler-not "sld"/"srd" passes
+ trivially here.  TODO: confirm the expected aarch64 code for a
+ constant index (e.g. an and/ubfx immediate form) and restate
+ the patterns accordingly.  */
+ unsigned long long res;
+ res = _bzhi_u64 (a, 8);
+ res += _bzhi_u64 (a, 16);
+ res += _bzhi_u64 (a, 24);
+ res += _bzhi_u64 (a, 32);
+ res += _bzhi_u64 (a, 40);
+ res += _bzhi_u64 (a, 48);
+ return (res);
+}
+/* NOTE(review): see comment above -- these expectations reference
+   PowerPC mnemonics and need aarch64 equivalents.  */
+
+/* { dg-final { scan-assembler-not "sld" } } */
+/* { dg-final { scan-assembler-not "srd" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-check.h b/gcc/testsuite/gcc.target/aarch64/bmi2-check.h
new file mode 100644
index 0000000..567cdb7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-check.h
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Each BMI2 test file defines bmi2_test, which calls abort () on a
+   mismatch; reaching the return in main means the test passed.  */
+static void bmi2_test (void);
+
+static void __attribute__ ((noinline)) do_test (void)
+{
+ bmi2_test ();
+}
+
+int
+main ()
+{
+ do_test ();
+
+#ifdef DEBUG
+ printf ("PASSED\n");
+#endif
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-1.c
new file mode 100644
index 0000000..14357fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-1.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned volatile a, unsigned b)
+{
+ unsigned long long res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += a;
+
+ return res;
+}
+
+__attribute__((noinline))
+unsigned long long
+gen_mulx (unsigned a, unsigned b)
+{
+ unsigned long long res;
+
+ res = (unsigned long long)a * b;
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned a = 0xce7ace0;
+ unsigned b = 0xfacefff;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u32 (a, b);
+ res = gen_mulx (a, b);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-2.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-2.c
new file mode 100644
index 0000000..440551f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-2.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned volatile a, unsigned b)
+{
+ unsigned long long res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += a;
+
+ return res;
+}
+
+__attribute__((noinline))
+unsigned calc_mulx_u32 (unsigned x, unsigned y, unsigned *res_h)
+{
+ return (unsigned) _mulx_u32 (x, y, res_h);
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned a = 0xce7ace0;
+ unsigned b = 0xfacefff;
+ unsigned res_l, res_h;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u32 (a, b);
+ res_l = calc_mulx_u32 (a, b, &res_h);
+
+ res = ((unsigned long long) res_h << 32) | res_l;
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-1.c
new file mode 100644
index 0000000..eb5f2c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long volatile a, unsigned long long b)
+{
+ unsigned __int128 res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += (unsigned __int128) a;
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned long long a = 0xce7ace0ce7ace0;
+ unsigned long long b = 0xface;
+ unsigned __int128 res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u64 (a, b);
+ res = (unsigned __int128) a * b;
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-2.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-2.c
new file mode 100644
index 0000000..8afc1f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-2.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long volatile a, unsigned long long b)
+{
+ unsigned __int128 res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += (unsigned __int128) a;
+
+ return res;
+}
+
+__attribute__((noinline))
+unsigned long long
+calc_mulx_u64 (unsigned long long x,
+ unsigned long long y,
+ unsigned long long *res_h)
+{
+ return _mulx_u64 (x, y, res_h);
+}
+
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned long long a = 0xce7ace0ce7ace0;
+ unsigned long long b = 0xface;
+ unsigned long long res_l, res_h;
+ unsigned __int128 res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u64 (a, b);
+
+ res_l = calc_mulx_u64 (a, b, &res_h);
+
+ res = ((unsigned __int128) res_h << 64) | res_l;
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep32-1.c
new file mode 100644
index 0000000..d08b869
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep32-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+ unsigned res = 0;
+ int i, k = 0;
+
+ for (i = 0; i < 32; ++i)
+ if (mask & (1 << i)) {
+ res |= ((a & (1 << k)) >> k) << i;
+ ++k;
+ }
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned src = 0xce7acc;
+ unsigned res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pdep_u32 (src, i * 3);
+ res = _pdep_u32 (src, i * 3);
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep64-1.c
new file mode 100644
index 0000000..1b97ec1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep64-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+ unsigned long long res = 0;
+ unsigned long long i, k = 0;
+
+ for (i = 0; i < 64; ++i)
+ if (mask & (1LL << i)) {
+ res |= ((a & (1LL << k)) >> k) << i;
+ ++k;
+ }
+ return res;
+}
+
+static
+void
+bmi2_test ()
+{
+ unsigned long long i;
+ unsigned long long src = 0xce7acce7acce7ac;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pdep_u64 (src, ~(i * 3));
+ res = _pdep_u64 (src, ~(i * 3));
+
+ if (res != res_ref)
+ abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pext32-1.c
new file mode 100644
index 0000000..9a8309c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pext32-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+ unsigned res = 0;
+ int i, k = 0;
+
+ for (i = 0; i < 32; ++i)
+ if (mask & (1 << i)) {
+ res |= ((a & (1 << i)) >> i) << k;
+ ++k;
+ }
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned src = 0xce7acc;
+ unsigned res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pext_u32 (src, ~(i * 3));
+ res = _pext_u32 (src, ~(i * 3));
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1.c
new file mode 100644
index 0000000..a7889f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+ unsigned long long res = 0;
+ int i, k = 0;
+
+ for (i = 0; i < 64; ++i)
+ if (mask & (1LL << i)) {
+ res |= ((a & (1LL << i)) >> i) << k;
+ ++k;
+ }
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned long long i;
+ unsigned long long src = 0xce7acce7acce7ac;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pext_u64 (src, ~(i * 3));
+ res = _pext_u64 (src, ~(i * 3));
+
+ if (res != res_ref)
+ abort();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1a.c
new file mode 100644
index 0000000..6fa828e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1a.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+
+unsigned long long
+test__pext_cmask_u64 (unsigned long long a[4])
+{
+ /* The _pext implementation is nominally a popcount of the mask,
+    followed by a loop using count leading zeros to find the
+    next bit to process.
+    If the mask is a const, the popcount should be folded and
+    the constant propagation should eliminate the mask
+    generation loop and produce constant-folded straight-line
+    code for each extract.
+    This test verifies that the compiler is replacing the mask
+    popcount and count-leading-zeros loop with that constant
+    folded code for this case.  */
+ const unsigned long mask = 0x00000000100000a4UL;
+ unsigned long res;
+ res = _pext_u64 (a[0], mask);
+ res = (res << 8) | _pext_u64 (a[1], mask);
+ res = (res << 8) | _pext_u64 (a[2], mask);
+ res = (res << 8) | _pext_u64 (a[3], mask);
+ return (res);
+}
+/* The resulting assembler should contain no cnt (popcount) or
+   clz instructions.  */
+
+/* { dg-final { scan-assembler-not "cnt\t" } } */
+/* { dg-final { scan-assembler-not "clz" } } */