I just tested with the following test program. The result is as expected:
#include <x86intrin.h>
#include <cstdio>
#include <cstdint>
/*
 * Test driver for the 16-bit TZCNT/LZCNT intrinsics.
 *
 * Prints a fixed reference line "0 -> 16", then the value returned by
 * __tzcnt_u16 and by __lzcnt16 for the input 0 and for every 16-bit value
 * that has exactly one bit set (0x1, 0x2, ..., 0x8000).
 *
 * Returns 0.
 */
int main(void)
{
    /* %u / %x take unsigned arguments, so every argument is made unsigned
       explicitly; the printed text is the same as with the former %d. */
    printf("0 -> %u\n\n", 16U);

    puts("__tzcnt_u16:");
    printf("0x%x -> %u\n", 0U, (unsigned)__tzcnt_u16(0));
    /* Shifting 0x8000 left once more truncates to 0 in uint16_t, which ends the loop. */
    for (uint16_t i = 1; i > 0; i <<= 1) {
        printf("0x%x -> %u\n", (unsigned)i, (unsigned)__tzcnt_u16(i));
    }

    puts("\n__lzcnt16:");
    printf("0x%x -> %u\n", 0U, (unsigned)__lzcnt16(0));
    for (uint16_t i = 1; i > 0; i <<= 1) {
        printf("0x%x -> %u\n", (unsigned)i, (unsigned)__lzcnt16(i));
    }

    return 0;
}
g++ -march=native -o intrins intrinsictest.cpp
./intrins
Attached is the patch for /usr/include/x86_64-linux-gnu/qt5/QtCore/qalgorithms.h
--- qalgorithms.h.orig 2016-12-18 15:57:30.279325472 +0100
+++ qalgorithms.h 2016-12-18 23:09:33.129324260 +0100
@@ -47,6 +47,10 @@ QT_WARNING_PUSH
 QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations")
 QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations")
 
+#if defined(__BMI__) && defined(__LZCNT__) && !QT_HAS_BUILTIN(__builtin_ctzs)
+# include <x86intrin.h>
+#endif
+
 /*
     Warning: The contents of QAlgorithmsPrivate is not a part of the public Qt API
     and may be changed from version to version or even be completely removed.
@@ -626,8 +630,10 @@ Q_DECL_RELAXED_CONSTEXPR inline uint qCo
 Q_DECL_RELAXED_CONSTEXPR inline uint qCountTrailingZeroBits(quint16 v) Q_DECL_NOTHROW
 {
 #if defined(Q_CC_GNU)
-# if QT_HAS_BUILTIN(__builtin_ctzs) || (defined(__LZCNT__) && defined(__BMI__))
+# if QT_HAS_BUILTIN(__builtin_ctzs)
     return v ? __builtin_ctzs(v) : 16U;
+# elif (defined(__LZCNT__) && defined(__BMI__))
+    return __tzcnt_u16(v);
 # else
     return v ? __builtin_ctz(v) : 16U;
 # endif
@@ -689,8 +695,10 @@ Q_DECL_RELAXED_CONSTEXPR inline uint qCo
 Q_DECL_RELAXED_CONSTEXPR inline uint qCountLeadingZeroBits(quint16 v) Q_DECL_NOTHROW
 {
 #if defined(Q_CC_GNU)
-# if QT_HAS_BUILTIN(__builtin_clzs) || (defined(__LZCNT__) && defined(__BMI__))
+# if QT_HAS_BUILTIN(__builtin_clzs)
     return v ? __builtin_clzs(v) : 16U;
+# elif (defined(__LZCNT__) && defined(__BMI__))
+    return __lzcnt16(v);
 # else
     return v ? __builtin_clz(v)-16U : 16U;
 # endif