Is it intentional or not that AArch64 does not define __ARM_NEON__? Otherwise, here's a better way to fold the test bits. AArch64 of course does not have dN+1 overlap the high part of the qM register, like AArch32, so the current
l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t)); implies extra register moves. But on the good side, the armv8 ADDV instruction allows two instructions to be removed from this fast path. When built for 32-bit, the new form results in the same instruction count; we simply keep using "q" registers instead of "d" registers for two more insns. Given that there are currently ifdefs involved, it would certainly be possible to keep the 32-bit path unchanged, if that's thought to be valuable. I did wonder if the armv8 stuff was supposed to be included in the AArch32 arm_neon.h? Is it just an oversight that it's missing? r~ * lex.c (search_line_fast) [__ARM_NEON]: Use __FOO not __FOO__ to detect neon support. Fold the comparison using ADDV when available. diff --git a/libcpp/lex.c b/libcpp/lex.c index 5366dad..6d1823e 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -638,7 +638,7 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) } } -#elif defined (__ARM_NEON__) +#elif defined (__ARM_NEON) #include "arm_neon.h" static const uchar * @@ -649,6 +649,7 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) const uint8x16_t repl_bs = vdupq_n_u8 ('\\'); const uint8x16_t repl_qm = vdupq_n_u8 ('?'); const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL); + const int16x8_t shift = { 0, 0, 0, 0, 8, 8, 8, 8 }; unsigned int misalign, found, mask; const uint8_t *p; @@ -670,10 +671,8 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) do { - uint8x8_t l; - uint16x4_t m; - uint32x2_t n; uint8x16_t t, u, v, w; + uint16x8_t l; p += 16; data = vld1q_u8 (p); @@ -685,12 +684,24 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) v = vorrq_u8 (t, vceqq_u8 (data, repl_bs)); w = vorrq_u8 (u, vceqq_u8 (data, repl_qm)); t = vandq_u8 (vorrq_u8 (v, w), xmask); - l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t)); - m = vpaddl_u8 (l); - n = vpaddl_u16 (m); - - found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n, - vshr_n_u64 ((uint64x1_t) n, 24)), 0); + + l = vpaddlq_u8 (t); + l = vshlq_u16 (l, shift); + + /* ??? Ideally, this would be if (__ARM_ARCH >= 8) since the ADDV insn + reduces the instruction count by two. But vaddvq is not present in + the arm32 arm_neon.h, nor does AArch64 define __ARM_ARCH. */ +#ifdef __aarch64__ + found = vaddvq_u16 (l); +#else + { + uint32x4_t m = vpaddlq_u16 (l); + uint64x2_t n = vpaddlq_u32 (m); + uint64x1_t o = vget_low_u64 (n) + vget_high_u64 (n); + found = vget_lane_u32 ((uint32x2_t)o, 0); + } +#endif + found &= mask; } while (!found); -- 1.9.3