Index: lib/Headers/bmiintrin.h
===================================================================
--- lib/Headers/bmiintrin.h	(revision 207789)
+++ lib/Headers/bmiintrin.h	(working copy)
@@ -32,6 +32,14 @@
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
 
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+/* No macro for _bextr_u32: it takes different arguments than __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+#define _blsr_u32(a)      (__blsr_u32((a)))
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
 static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
 __tzcnt_u16(unsigned short __X)
 {
@@ -44,13 +52,21 @@
   return ~__X & __Y;
 }
 
+/* AMD-specified, double-leading-underscore version of BEXTR */
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
   return __builtin_ia32_bextr_u32(__X, __Y);
 }
 
+/* Intel-specified, single-leading-underscore version of BEXTR */
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
 __blsi_u32(unsigned int __X)
 {
   return __X & -__X;
@@ -75,19 +91,35 @@
 }
 
 #ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+/* No macro for _bextr_u64: it takes different arguments than __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+#define _blsr_u64(a)      (__blsr_u64((a)))
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
   return ~__X & __Y;
 }
 
+/* AMD-specified, double-leading-underscore version of BEXTR */
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
   return __builtin_ia32_bextr_u64(__X, __Y);
 }
 
+/* Intel-specified, single-leading-underscore version of BEXTR */
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
 __blsi_u64(unsigned long long __X)
 {
   return __X & -__X;
@@ -110,6 +142,7 @@
 {
   return __builtin_ctzll(__X);
 }
-#endif
 
+#endif /* __x86_64__ */
+
 #endif /* __BMIINTRIN_H */
Index: test/CodeGen/bmi-builtins.c
===================================================================
--- test/CodeGen/bmi-builtins.c	(revision 207789)
+++ test/CodeGen/bmi-builtins.c	(working copy)
@@ -5,6 +5,14 @@
 
 #include <x86intrin.h>
 
+// The double-underscore intrinsics are for compatibility with
+// AMD's BMI interface. The single-underscore intrinsics are for
+// compatibility with Intel's BMI interface. Apart from the
+// underscores, the interfaces are identical except in one case:
+// although the register form of the 'bextr' instruction is the same
+// in hardware, the AMD and Intel intrinsics pass the start and
+// length operands differently.
+
 unsigned short test__tzcnt_u16(unsigned short __X) {
   // CHECK: @llvm.cttz.i16
   return __tzcnt_u16(__X);
@@ -39,7 +47,7 @@
   return __blsr_u32(__X);
 }
 
-unsigned int test_tzcnt_u32(unsigned int __X) {
+unsigned int test__tzcnt_u32(unsigned int __X) {
   // CHECK: @llvm.cttz.i32
   return __tzcnt_u32(__X);
 }
@@ -77,3 +85,80 @@
   // CHECK: @llvm.cttz.i64
   return __tzcnt_u64(__X);
 }
+
+// Intel intrinsics
+
+unsigned short test_tzcnt_u16(unsigned short __X) {
+  // CHECK: @llvm.cttz.i16
+  return _tzcnt_u16(__X);
+}
+
+unsigned int test_andn_u32(unsigned int __X, unsigned int __Y) {
+  // CHECK: [[DEST:%.*]] = xor i32 %{{.*}}, -1
+  // CHECK-NEXT: %{{.*}} = and i32 %{{.*}}, [[DEST]]
+  return _andn_u32(__X, __Y);
+}
+
+unsigned int test_bextr_u32(unsigned int __X, unsigned int __Y,
+                            unsigned int __Z) {
+  // CHECK: @llvm.x86.bmi.bextr.32
+  return _bextr_u32(__X, __Y, __Z);
+}
+
+unsigned int test_blsi_u32(unsigned int __X) {
+  // CHECK: [[DEST:%.*]] = sub i32 0, [[SRC:%.*]]
+  // CHECK-NEXT: %{{.*}} = and i32 [[SRC]], [[DEST]]
+  return _blsi_u32(__X);
+}
+
+unsigned int test_blsmsk_u32(unsigned int __X) {
+  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = xor i32 [[DEST]], [[SRC]]
+  return _blsmsk_u32(__X);
+}
+
+unsigned int test_blsr_u32(unsigned int __X) {
+  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = and i32 [[DEST]], [[SRC]]
+  return _blsr_u32(__X);
+}
+
+unsigned int test_tzcnt_u32(unsigned int __X) {
+  // CHECK: @llvm.cttz.i32
+  return _tzcnt_u32(__X);
+}
+
+unsigned long long test_andn_u64(unsigned long long __X, unsigned long long __Y) {
+  // CHECK: [[DEST:%.*]] = xor i64 %{{.*}}, -1
+  // CHECK-NEXT: %{{.*}} = and i64 %{{.*}}, [[DEST]]
+  return _andn_u64(__X, __Y);
+}
+
+unsigned long long test_bextr_u64(unsigned long long __X, unsigned int __Y,
+                                  unsigned int __Z) {
+  // CHECK: @llvm.x86.bmi.bextr.64
+  return _bextr_u64(__X, __Y, __Z);
+}
+
+unsigned long long test_blsi_u64(unsigned long long __X) {
+  // CHECK: [[DEST:%.*]] = sub i64 0, [[SRC:%.*]]
+  // CHECK-NEXT: %{{.*}} = and i64 [[SRC]], [[DEST]]
+  return _blsi_u64(__X);
+}
+
+unsigned long long test_blsmsk_u64(unsigned long long __X) {
+  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = xor i64 [[DEST]], [[SRC]]
+  return _blsmsk_u64(__X);
+}
+
+unsigned long long test_blsr_u64(unsigned long long __X) {
+  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
+  // CHECK-NEXT: %{{.*}} = and i64 [[DEST]], [[SRC]]
+  return _blsr_u64(__X);
+}
+
+unsigned long long test_tzcnt_u64(unsigned long long __X) {
+  // CHECK: @llvm.cttz.i64
+  return _tzcnt_u64(__X);
+}
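
Not part of the patch: a minimal usage sketch of the two BEXTR forms added above, assuming a BMI-capable target (for example, built with -mbmi). The intrinsic names come from bmiintrin.h as modified here; the file name, sample values, and printf harness are illustrative only. Both calls extract the same 12-bit field and should print the same value.

/* bextr_demo.c -- illustrative only; compile with: clang -mbmi bextr_demo.c */
#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  unsigned int x = 0x12345678;
  unsigned int start = 8, len = 12;               /* extract bits 8..19 */

  /* Intel form: start and length are separate arguments. */
  unsigned int intel = _bextr_u32(x, start, len);

  /* AMD form: one control word, start in bits 7:0, length in bits 15:8. */
  unsigned int amd = __bextr_u32(x, (start & 0xff) | ((len & 0xff) << 8));

  printf("intel=0x%x amd=0x%x\n", intel, amd);    /* both print 0x456 */
  return 0;
}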
