[vpp-dev] Debug image build error on Arm CPUs

Lijian Zhang Tue, 18 Feb 2020 00:39:15 -0800

Hi Damjan,
There's a compile error when building the debug image with the code segment below.
The error message is pasted as follows.


>>static_always_inline u8x16
   u8x16_word_shift_left (u8x16 x, const int n)
   {
     return vextq_u8 (u8x16_splat (0), x, 16 - n);
   }

In function 'vextq_u8',
    inlined from 'u8x16_word_shift_left' at
vpp/src/vppinfra/vector_neon.h:188:10:
/usr/lib/gcc/aarch64-linux-gnu/8/include/arm_neon.h:16646:3: error: lane index 
must be a constant immediate
   __AARCH64_LANE_CHECK (__a, __c);
   ^~~~~~~~~~~~~~~~~~~~

The root cause is that the third parameter of vextq_u8 (uint8x16_t __a, uint8x16_t 
__b, __const int __c) requires a compile-time immediate value rather than a 
const variable, because the intrinsic eventually maps down to the instruction:
EXT Vd.16B,Vn.16B,Vm.16B,#n
where #n is the operand __c. Unfortunately there is no good way of expressing 
that requirement in the C language, so "const int" is the best the compiler team 
can do in the current implementation, but the __AARCH64_LANE_CHECK macro is 
there to enforce that constraint.

This failure is observed when building the debug image, but not when building 
the release image, because GCC can obtain the immediate value for operand __c 
at compile time via the inlined functions. If the __always_inline attribute is 
removed, GCC will not be able to figure out the immediate value, and will report 
errors. That's why the VPP release image is fine, while the debug image reports 
errors. The following example illustrates the inlined case:
#include "arm_neon.h"
/* Minimal reproduction: calls vextq_u8 with x as both vector operands and
 * n+1 as the lane index.  The lane index must be a compile-time immediate,
 * so n+1 is only usable once this function has been inlined into a caller
 * that passes a constant; hence the always_inline attribute. */
static inline __attribute__ ((__always_inline__)) uint8x16_t shift_left 
(uint8x16_t x, const int n)
{
  return vextq_u8 (x, x, n+1);
}
/* Caller passing a literal constant for n. */
uint8x16_t shift_left2 (uint8x16_t x) { return shift_left (x, 1); }
/* File-scope const variable used as the argument below (not a literal). */
const int a = 3;
/* Caller passing the const variable a for n. */
uint8x16_t shift_left3 (uint8x16_t x) { return shift_left (x, a); }

However, the above inline solution is not accepted by Clang. Clang reports an 
error for the above code regardless of whether the function is inlined or not.

For the debug image, I tried adding the __always_inline attribute to function 
u8x16_word_shift_left (), but it doesn't work. It seems the caller, and even 
several upper-layer callers, of function u8x16_word_shift_left would also need 
the __always_inline attribute. I didn't manage to resolve the errors this 
way, as there seem to be too many places that require such a code change.

One solution I have in mind is to make a wrapper for vextq_u8() using an array of 
functions _vextq_u8[]. Each entry of the function array calls vextq_u8 
with an immediate value. Below is some pseudo code. A complete patch is attached 
at the end of the email.
Could you please share your thoughts on this solution and advise on this issue?



+static u8x16                        \

+ u8x16_word_shift_left (u8x16 x, const int n)       \

+{  return (u8x16) _vextq_u8 [c - n] (vdupq_n_u8 (0), x); }    \



static u8x16                   \

_vextq_u8_0 (u8x16 a, u8x16 b)       \

{ return (u8x16) vextq_u8 (a, b, 0); }

static u8x16                   \

_vextq_u8_1 (u8x16 a, u8x16 b)       \

{ return (u8x16) vextq_u8 (a, b, 1); }

....

static u8x16                   \

_vextq_u8_16 (u8x16 a, u8x16 b)       \

{ return (u8x16) vextq_u8 (a, b, 16); }



+static u8x16 (* _vextq_u8 [16]) (u8x16, u8x16) = { \

+   _vextq_u8_0, \

+   _vextq_u8_1, \

+   _vextq_u8_2, \

+   _vextq_u8_3, \

+   _vextq_u8_4, \

+   _vextq_u8_5, \

+   _vextq_u8_6, \

+   _vextq_u8_7, \

+   _vextq_u8_8, \

+   _vextq_u8_9, \

+   _vextq_u8_10, \

+   _vextq_u8_11, \

+   _vextq_u8_12, \

+   _vextq_u8_13, \

+   _vextq_u8_14, \

+   _vextq_u8_15, \

+}; \



Complete patch:
diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h
index 3855f55ad..de77d0864 100644
--- a/src/vppinfra/vector_neon.h
+++ b/src/vppinfra/vector_neon.h
@@ -46,6 +46,81 @@ u8x16_compare_byte_mask (u8x16 v)
    return (u32) (vgetq_lane_u64 (x64, 0) + (vgetq_lane_u64 (x64, 1) << 8));
  }

+/* *INDENT-OFF* */
+#define foreach_neon_vec128i \
+  _(i,8,16,s8,0) _(i,8,16,s8,1) _(i,8,16,s8,2) _(i,8,16,s8,3) \
+  _(i,8,16,s8,4) _(i,8,16,s8,5) _(i,8,16,s8,6) _(i,8,16,s8,7) \
+  _(i,8,16,s8,8) _(i,8,16,s8,9) _(i,8,16,s8,10) _(i,8,16,s8,11) \
+  _(i,8,16,s8,12) _(i,8,16,s8,13) _(i,8,16,s8,14) _(i,8,16,s8,15) \
+  _(i,16,8,s16,0) _(i,16,8,s16,1) _(i,16,8,s16,2) _(i,16,8,s16,3) \
+  _(i,16,8,s16,4) _(i,16,8,s16,5) _(i,16,8,s16,6) _(i,16,8,s16,7) \
+  _(i,32,4,s32,0) _(i,32,4,s32,1) _(i,32,4,s32,2) _(i,32,4,s32,3) \
+  _(i,64,2,s64,0) _(i,64,2,s64,1)
+#define foreach_neon_vec128u \
+  _(u,8,16,u8,0) _(u,8,16,u8,1) _(u,8,16,u8,2) _(u,8,16,u8,3) \
+  _(u,8,16,u8,4) _(u,8,16,u8,5) _(u,8,16,u8,6) _(u,8,16,u8,7) \
+  _(u,8,16,u8,8) _(u,8,16,u8,9) _(u,8,16,u8,10) _(u,8,16,u8,11) \
+  _(u,8,16,u8,12) _(u,8,16,u8,13) _(u,8,16,u8,14) _(u,8,16,u8,15) \
+  _(u,16,8,u16,0) _(u,16,8,u16,1) _(u,16,8,u16,2) _(u,16,8,u16,3) \
+  _(u,16,8,u16,4) _(u,16,8,u16,5) _(u,16,8,u16,6) _(u,16,8,u16,7) \
+  _(u,32,4,u32,0) _(u,32,4,u32,1) _(u,32,4,u32,2) _(u,32,4,u32,3) \
+  _(u,64,2,u64,0) _(u,64,2,u64,1)
+
+#define _(t, s, c, i, n) \
+static t##s##x##c                  \
+_vextq_##i##_##n (t##s##x##c a, t##s##x##c b)      \
+{ return (t##s##x##c) vextq_##i (a, b, n); }
+
+foreach_neon_vec128i foreach_neon_vec128u
+#undef _
+
+#define _(t, i) \
+static t##8x16 (* _vextq_##i##8 [16]) (t##8x16, t##8x16) = { \
+   _vextq_##i##8_0, \
+   _vextq_##i##8_1, \
+   _vextq_##i##8_2, \
+   _vextq_##i##8_3, \
+   _vextq_##i##8_4, \
+   _vextq_##i##8_5, \
+   _vextq_##i##8_6, \
+   _vextq_##i##8_7, \
+   _vextq_##i##8_8, \
+   _vextq_##i##8_9, \
+   _vextq_##i##8_10, \
+   _vextq_##i##8_11, \
+   _vextq_##i##8_12, \
+   _vextq_##i##8_13, \
+   _vextq_##i##8_14, \
+   _vextq_##i##8_15, \
+}; \
+static t##16x8 (* _vextq_##i##16 [8]) (t##16x8, t##16x8) = { \
+   _vextq_##i##16_0, \
+   _vextq_##i##16_1, \
+   _vextq_##i##16_2, \
+   _vextq_##i##16_3, \
+   _vextq_##i##16_4, \
+   _vextq_##i##16_5, \
+   _vextq_##i##16_6, \
+   _vextq_##i##16_7, \
+}; \
+static t##32x4 (* _vextq_##i##32 [4]) (t##32x4, t##32x4) = { \
+   _vextq_##i##32_0, \
+   _vextq_##i##32_1, \
+   _vextq_##i##32_2, \
+   _vextq_##i##32_3, \
+}; \
+static t##64x2 (* _vextq_##i##64 [2]) (t##64x2, t##64x2) = { \
+  (void *) _vextq_##i##64_0, \
+  (void *) _vextq_##i##64_1, \
+};
+_(i, s)
+_(u, u)
+#undef _
+#undef foreach_neon_vec128i
+#undef foreach_neon_vec128u
+/* *INDENT-ON* */
+
  /* *INDENT-OFF* */
  #define foreach_neon_vec128i \
    _(i,8,16,s8) _(i,16,8,s16) _(i,32,4,s32)  _(i,64,2,s64)
@@ -90,7 +165,17 @@ t##s##x##c##_is_greater (t##s##x##c a, t##s##x##c b)        
    \
  \
  static_always_inline t##s##x##c                        \
  t##s##x##c##_blend (t##s##x##c dst, t##s##x##c src, u##s##x##c mask)   \
-{ return (t##s##x##c) vbslq_##i (mask, src, dst); }
+{ return (t##s##x##c) vbslq_##i (mask, src, dst); }            \
+\
+static_always_inline u##s##x##c                        \
+t##s##x##c##_word_shift_left (t##s##x##c x, const int n)       \
+{ /*ASSERT ((c >= n) && (0 < n));*/                    \
+  return (u##s##x##c) _vextq_##i [c - n] (t##s##x##c##_splat (0), x); }    \
+\
+static_always_inline u##s##x##c                        \
+t##s##x##c##_word_shift_right (t##s##x##c x, const int n)      \
+{ /*ASSERT ((0 <= n) && (c > n));*/                    \
+  return (u##s##x##c) _vextq_##i [n] (x, t##s##x##c##_splat (0)); }

  foreach_neon_vec128i foreach_neon_vec128u

Thanks.

-=-=-=-=-=-=-=-=-=-=-=-
Links: You receive all messages sent to this group.

View/Reply Online (#15442): https://lists.fd.io/g/vpp-dev/message/15442
Mute This Topic: https://lists.fd.io/mt/71367647/21656
Group Owner: [email protected]
Unsubscribe: https://lists.fd.io/g/vpp-dev/unsub  [[email protected]]
-=-=-=-=-=-=-=-=-=-=-=-

[vpp-dev] Debug image build error on Arm CPUs

Reply via email to