Index: lib/Headers/bmiintrin.h
===================================================================
--- lib/Headers/bmiintrin.h	(revision 207712)
+++ lib/Headers/bmiintrin.h	(working copy)
@@ -32,43 +32,93 @@
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
 
+/*
+ * Provide a single underscore variant of each double underscore intrinsic.
+ * The double underscore flavor was apparently introduced by AMD first, and
+ * even though (1) no other x86 intrinsics use double underscores and
+ * (2) the double underscore intrinsics are undocumented, we keep them for
+ * compatibility with GCC.
+ */
+
 static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+_tzcnt_u16(unsigned short __X)
+{
+  return __builtin_ctzs(__X);
+}
+
+static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
 __tzcnt_u16(unsigned short __X)
 {
   return __builtin_ctzs(__X);
 }
 
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __andn_u32(unsigned int __X, unsigned int __Y)
 {
   return ~__X & __Y;
 }
 
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
   return __builtin_ia32_bextr_u32(__X, __Y);
 }
 
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __blsi_u32(unsigned int __X)
 {
   return __X & -__X;
 }
 
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __blsmsk_u32(unsigned int __X)
 {
   return __X ^ (__X - 1);
 }
 
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __blsr_u32(unsigned int __X)
 {
   return __X & (__X - 1);
 }
 
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_tzcnt_u32(unsigned int __X)
+{
+  return __builtin_ctz(__X);
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __tzcnt_u32(unsigned int __X)
 {
   return __builtin_ctz(__X);
@@ -76,36 +126,72 @@
 
 #ifdef __x86_64__
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_andn_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
   return ~__X & __Y;
 }
 
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
   return __builtin_ia32_bextr_u64(__X, __Y);
 }
 
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __blsi_u64(unsigned long long __X)
 {
   return __X & -__X;
 }
 
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __blsmsk_u64(unsigned long long __X)
 {
   return __X ^ (__X - 1);
 }
 
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __blsr_u64(unsigned long long __X)
 {
   return __X & (__X - 1);
 }
 
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_tzcnt_u64(unsigned long long __X)
+{
+  return __builtin_ctzll(__X);
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __tzcnt_u64(unsigned long long __X)
 {
   return __builtin_ctzll(__X);
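
The new intrinsic bodies above are plain two's-complement identities rather
than inline assembly, so they can be sanity-checked in ordinary C without BMI
hardware. A minimal standalone sketch (not part of the patch; the sample
value 0x68 and the bit positions are arbitrary):

  #include <assert.h>

  int main(void) {
    unsigned int x = 0x68u;            /* 0110 1000b: lowest set bit is bit 3 */

    assert((x & -x) == 0x08u);         /* blsi:   isolate lowest set bit      */
    assert((x ^ (x - 1)) == 0x0Fu);    /* blsmsk: mask through lowest set bit */
    assert((x & (x - 1)) == 0x60u);    /* blsr:   clear lowest set bit        */
    assert((~x & 0xFFu) == 0x97u);     /* andn:   ~x & y, here with y = 0xFF  */
    assert(__builtin_ctz(x) == 3);     /* tzcnt on a nonzero input            */

    /* bextr takes start in bits 7:0 and length in bits 15:8 of its second
       operand; a control word of 0x0403 extracts 4 bits starting at bit 3. */
    assert(((x >> 3) & 0xFu) == 0x0Du);

    return 0;
  }
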
Index: test/CodeGen/bmi-builtins.c
===================================================================
--- test/CodeGen/bmi-builtins.c	(revision 207712)
+++ test/CodeGen/bmi-builtins.c	(working copy)
@@ -39,7 +39,7 @@
   return __blsr_u32(__X);
 }
 
-unsigned int test_tzcnt_u32(unsigned int __X) {
+unsigned int test__tzcnt_u32(unsigned int __X) {
   // CHECK: @llvm.cttz.i32
   return __tzcnt_u32(__X);
 }
@@ -77,3 +77,82 @@
   // CHECK: @llvm.cttz.i64
   return __tzcnt_u64(__X);
 }
+
+// Repeat all tests with the single underscore variant of each intrinsic.
+// The double underscore flavor was apparently introduced by AMD first, and
+// even though (1) no other x86 intrinsics use double underscores and
+// (2) the double underscore intrinsics are undocumented, we keep them for
+// compatibility with GCC.
+
+unsigned short test_tzcnt_u16(unsigned short __X) {
+  // CHECK: @llvm.cttz.i16
+  return _tzcnt_u16(__X);
+}
+
+unsigned int test_andn_u32(unsigned int __X, unsigned int __Y) {
+  // CHECK: [[DEST:%.*]] = xor i32 %{{.*}}, -1
+  // CHECK-NEXT: %{{.*}} = and i32 %{{.*}}, [[DEST]]
+  return _andn_u32(__X, __Y);
+}
+
+unsigned int test_bextr_u32(unsigned int __X, unsigned int __Y) {
+  // CHECK: @llvm.x86.bmi.bextr.32
+  return _bextr_u32(__X, __Y);
+}
+
+unsigned int test_blsi_u32(unsigned int __X) {
+  // CHECK: [[DEST:%.*]] = sub i32 0, [[SRC:%.*]]
+  // CHECK-NEXT: %{{.*}} = and i32 [[SRC]], [[DEST]]
+  return _blsi_u32(__X);
+}
+
+unsigned int test_blsmsk_u32(unsigned int __X) {
+  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = xor i32 [[DEST]], [[SRC]]
+  return _blsmsk_u32(__X);
+}
+
+unsigned int test_blsr_u32(unsigned int __X) {
+  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = and i32 [[DEST]], [[SRC]]
+  return _blsr_u32(__X);
+}
+
+unsigned int test_tzcnt_u32(unsigned int __X) {
+  // CHECK: @llvm.cttz.i32
+  return _tzcnt_u32(__X);
+}
+
+unsigned long long test_andn_u64(unsigned long long __X, unsigned long long __Y) {
+  // CHECK: [[DEST:%.*]] = xor i64 %{{.*}}, -1
+  // CHECK-NEXT: %{{.*}} = and i64 %{{.*}}, [[DEST]]
+  return _andn_u64(__X, __Y);
+}
+
+unsigned long long test_bextr_u64(unsigned long long __X, unsigned long long __Y) {
+  // CHECK: @llvm.x86.bmi.bextr.64
+  return _bextr_u64(__X, __Y);
+}
+
+unsigned long long test_blsi_u64(unsigned long long __X) {
+  // CHECK: [[DEST:%.*]] = sub i64 0, [[SRC:%.*]]
+  // CHECK-NEXT: %{{.*}} = and i64 [[SRC]], [[DEST]]
+  return _blsi_u64(__X);
+}
+
+unsigned long long test_blsmsk_u64(unsigned long long __X) {
+  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = xor i64 [[DEST]], [[SRC]]
+  return _blsmsk_u64(__X);
+}
+
+unsigned long long test_blsr_u64(unsigned long long __X) {
+  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = and i64 [[DEST]], [[SRC]]
+  return _blsr_u64(__X);
+}
+
+unsigned long long test_tzcnt_u64(unsigned long long __X) {
+  // CHECK: @llvm.cttz.i64
+  return _tzcnt_u64(__X);
+}
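
With both spellings exported, callers targeting BMI (e.g. clang -mbmi) can use
either name. A hypothetical usage example, assuming <x86intrin.h> pulls this
header in on a BMI-enabled target:

  #include <x86intrin.h>

  /* The double underscore spelling matches GCC/AMD; the single underscore
     spelling added by this patch matches Intel's documented names. */
  unsigned int lowest_set_bit_gcc(unsigned int v)   { return __blsi_u32(v); }
  unsigned int lowest_set_bit_intel(unsigned int v) { return _blsi_u32(v); }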
