Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking

2016-04-06 Thread Vijay Kilari
On Tue, Apr 5, 2016 at 8:06 PM, Peter Maydell  wrote:
> On 4 April 2016 at 14:39,   wrote:
>> From: Vijay 
>>
>> Use Neon instructions to perform zero checking of
>> a buffer. This helps in reducing downtime during
>> live migration.
>>
>> Signed-off-by: Vijaya Kumar K 
>> ---
>>  util/cutils.c |   81 +
>>  1 file changed, 81 insertions(+)
>>
>> diff --git a/util/cutils.c b/util/cutils.c
>> index 43d1afb..d343b9a 100644
>> --- a/util/cutils.c
>> +++ b/util/cutils.c
>> @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>>  return func;
>>  }
>>  #pragma GCC pop_options
>> +
>> +#elif defined __aarch64__
>> +#include "arm_neon.h"
>
> Can we rely on all compilers having this, or do we need to
> test in configure?

GCC and armcc support the same intrinsics. Both need inclusion
of arm_neon.h.
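
(If a configure test were wanted anyway, a hypothetical compile probe
could be as small as the following, falling back to the generic code
when it fails to build -- a sketch, not an actual configure hunk:)

#include <arm_neon.h>
int main(void)
{
    uint64x2_t v = vdupq_n_u64(0);
    return (int)vgetq_lane_u64(vorrq_u64(v, v), 0);
}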

>
>> +
>> +#define NEON_VECTYPE   uint64x2_t
>> +#define NEON_LOAD_N_ORR(v1, v2)vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
>> +#define NEON_ORR(v1, v2)   vorrq_u64(v1, v2)
>> +#define NEON_EQ_ZERO(v1) \
>> +((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
>> + (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)
>
> The intrinsics are a bit confusing, but shouldn't we be
> testing that both lanes of v1 are 0, rather than whether
> either of them is? (so "&&", not "||").

The above check is correct: vceqzq_u64() sets all bits of a result
lane to 1 when the corresponding input lane is 0, so a zero lane in
its result means that lane of v1 is non-zero, i.e. the buffer is
non-zero. I think redefining this macro as below would be better and
would avoid vceqzq_u64():

#define NEON_NOT_EQ_ZERO(v1) \
((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
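
(To make the lane semantics concrete, here is a small standalone
sketch -- it assumes a toolchain that provides vceqzq_u64(), as the
patch already does:)

#include <arm_neon.h>
#include <assert.h>

int main(void)
{
    uint64x2_t v = vdupq_n_u64(0);      /* both lanes zero */
    v = vsetq_lane_u64(0xff, v, 1);     /* make lane 1 non-zero */

    /* vceqzq_u64: a result lane is all-ones iff the input lane was 0 */
    uint64x2_t eq = vceqzq_u64(v);
    assert(vgetq_lane_u64(eq, 0) == ~0ULL); /* lane 0 of v was zero */
    assert(vgetq_lane_u64(eq, 1) == 0);     /* lane 1 was non-zero */

    /* hence "||": one zero lane in the vceqzq_u64() result already
     * proves the OR-accumulated vector, and so the buffer, is
     * non-zero */
    return 0;
}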

>
>> +
>> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>> +
>> +/*
>> + * Zero page/buffer checking using SIMD(Neon)
>> + */
>> +
>> +static bool
>> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> +{
>> +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
>> +   * sizeof(NEON_VECTYPE)) == 0
>> +&& ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
>> +}
>> +
>> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> +{
>> +size_t i;
>> +NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
>> +NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
>> +uint64_t const *data = buf;
>> +
>> +assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
>> +len /= sizeof(unsigned long);
>> +
>> +for (i = 0; i < len; i += 32) {
>> +d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
>> +d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
>> +d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
>> +d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
>> +d4 = NEON_ORR(d0, d1);
>> +d5 = NEON_ORR(d2, d3);
>> +d6 = NEON_ORR(d4, d5);
>> +
>> +d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
>> +d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
>> +d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
>> +d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
>> +d11 = NEON_ORR(d7, d8);
>> +d12 = NEON_ORR(d9, d10);
>> +d13 = NEON_ORR(d11, d12);
>> +
>> +d14 = NEON_ORR(d6, d13);
>> +if (NEON_EQ_ZERO(d14)) {
>> +break;
>> +}
>> +}
>
> Both the other optimised find_nonzero implementations in this
> file have two loops, not just one. Is it OK that this
> implementation has only a single loop?
>
> Paolo: do you know why we have two loops in the other
> implementations?

Paolo is right, as he mentioned in the previous email. But with two
loops I didn't see much benefit, so I restricted it to one loop.

>
>> +
>> +return i * sizeof(unsigned long);
>> +}
>> +
>> +static inline bool neon_support(void)
>> +{
>> +/*
>> + * Check if neon feature is supported.
>> + * By default neon is supported for aarch64.
>> + */
>> +return true;
>> +}
>
> There doesn't seem much point in this. We can assume Neon exists
> on any CPU we're going to run on (it's part of the ABI, the kernel
> assumes it, etc etc). So you can just implement the functions without
> the indirection functions below.
>
Hmm. One reason is that compilation fails if we don't call
can_use_buffer_find_nonzero_offset_inner() from inside the Neon
implementation. So I added this, similar to the Intel AVX2 code. I
also thought that if any platform does not implement Neon, it could
simply skip this function.
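
(For reference, a minimal sketch of the direct wiring Peter suggests,
with the runtime check dropped; the unused-_inner() compile failure
mentioned above would then need solving separately, e.g. by compiling
the generic helpers out on this path:)

bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
{
    return can_use_buffer_find_nonzero_offset_neon(buf, len);
}

size_t buffer_find_nonzero_offset(const void *buf, size_t len)
{
    return buffer_find_nonzero_offset_neon(buf, len);
}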

>> +
>> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
>> +   can_use_buffer_find_nonzero_offset_inner(buf, len);
>> +}
>> +
>> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
>> +{
>> +return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
>> +   buffer_find_nonzero_offset_inner(buf, len);
>> +}

Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking

2016-04-05 Thread Peter Maydell
On 5 April 2016 at 16:21, Paolo Bonzini  wrote:
> But in theory it should be enough to add a new #elif branch like this:
>
> #include "arm_neon.h"
> #define VECTYPE   uint64x2_t
> #define VEC_OR(a, b) ((a) | (b))
> #define ALL_EQ(a, b) /* ??? :) */

#define ALL_EQ(a, b) (vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) && \
  vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1))

will do I think (probably suboptimal for a true vector compare but
works OK here as we're actually only interested in comparing against
constant zero; the compiler generates "load 64bit value from vector
register to integer; cbnz" for each half of the vector).
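
(Putting Paolo's template together with that ALL_EQ, the whole #elif
branch would be just:)

#elif defined(__aarch64__)
#include "arm_neon.h"
#define VECTYPE        uint64x2_t
#define VEC_OR(a, b)   ((a) | (b))
#define ALL_EQ(a, b)   (vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) && \
                        vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1))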

Worth benchmarking that (and the variant where we use the C code
but move the loop unrolling out to 16) against the handwritten
intrinsics version.
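
(A rough standalone harness for that comparison might look like the
following; the all-zero page, the page size and the iteration count
are arbitrary assumptions:)

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

extern size_t buffer_find_nonzero_offset(const void *buf, size_t len);

int main(void)
{
    enum { PAGE = 4096, ITERS = 1000000 };
    void *buf;
    struct timespec t0, t1;
    size_t i, sink = 0;

    if (posix_memalign(&buf, 16, PAGE)) {
        return 1;
    }
    memset(buf, 0, PAGE);               /* worst case: all-zero page */

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (i = 0; i < ITERS; i++) {
        sink += buffer_find_nonzero_offset(buf, PAGE);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("%.1f ns/page (sink %zu)\n",
           ((t1.tv_sec - t0.tv_sec) * 1e9 +
            (t1.tv_nsec - t0.tv_nsec)) / (double)ITERS, sink);
    free(buf);
    return 0;
}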

thanks
-- PMM



Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking

2016-04-05 Thread Peter Maydell
On 4 April 2016 at 14:39,   wrote:
> From: Vijay 
>
> Use Neon instructions to perform zero checking of
> a buffer. This helps in reducing downtime during
> live migration.

One other comment I forgot:

> +#define NEON_VECTYPE   uint64x2_t

This is a 128-bit type...

> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +size_t i;
> +NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;

...so it's a bit confusing to use d0, d1, etc, which implies
a 64-bit value.

thanks
-- PMM



Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking

2016-04-05 Thread Paolo Bonzini


On 05/04/2016 16:36, Peter Maydell wrote:
>> > +
>> > +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
>> > +
>> > +/*
>> > + * Zero page/buffer checking using SIMD(Neon)
>> > + */
>> > +
>> > +static bool
>> > +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> > +{
>> > +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
>> > +   * sizeof(NEON_VECTYPE)) == 0
>> > +&& ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
>> > +}
>> > +
>> > +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
>> > +{
>> > +size_t i;
>> > +NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
>> > +NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
>> > +uint64_t const *data = buf;
>> > +
>> > +assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
>> > +len /= sizeof(unsigned long);
>> > +
>> > +for (i = 0; i < len; i += 32) {
>> > +d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
>> > +d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
>> > +d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
>> > +d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
>> > +d4 = NEON_ORR(d0, d1);
>> > +d5 = NEON_ORR(d2, d3);
>> > +d6 = NEON_ORR(d4, d5);
>> > +
>> > +d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
>> > +d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
>> > +d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
>> > +d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
>> > +d11 = NEON_ORR(d7, d8);
>> > +d12 = NEON_ORR(d9, d10);
>> > +d13 = NEON_ORR(d11, d12);
>> > +
>> > +d14 = NEON_ORR(d6, d13);
>> > +if (NEON_EQ_ZERO(d14)) {
>> > +break;
>> > +}
>> > +}
> Both the other optimised find_nonzero implementations in this
> file have two loops, not just one. Is it OK that this
> implementation has only a single loop?
> 
> Paolo: do you know why we have two loops in the other
> implementations?

Because usually the first one or two iterations are enough to exit the
function if the page is nonzero.  It's measurably slower to go through
the unrolled loop in that case.  On the other hand, once the first few
iterations found only zero bytes, the buffer is very likely entirely
zero and the unrolled loop helps.
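
(Schematically, the two-loop shape of the generic code is roughly the
following -- a paraphrase of buffer_find_nonzero_offset_inner(), not a
verbatim quote of util/cutils.c:)

size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
{
    const VECTYPE *p = buf;
    const VECTYPE zero = (VECTYPE){0};
    size_t i;

    /* loop 1: scan the first few vectors one at a time, so a page
     * that is non-zero near the start returns quickly */
    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
        if (!ALL_EQ(p[i], zero)) {
            return i * sizeof(VECTYPE);
        }
    }

    /* loop 2: the start was all zero, so the whole page probably is;
     * OR vectors together in an unrolled loop and test only once per
     * iteration */
    for (; i < len / sizeof(VECTYPE);
         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
        VECTYPE t01 = VEC_OR(p[i + 0], p[i + 1]);
        VECTYPE t23 = VEC_OR(p[i + 2], p[i + 3]);
        /* ... remaining pairs OR'd together the same way ... */
        if (!ALL_EQ(VEC_OR(t01, t23), zero)) {
            break;
        }
    }

    return i * sizeof(VECTYPE);
}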

But in theory it should be enough to add a new #elif branch like this:

#include "arm_neon.h"
#define VECTYPE   uint64x2_t
#define VEC_OR(a, b) ((a) | (b))
#define ALL_EQ(a, b) /* ??? :) */

around the

/* vector definitions */

comment in util/cutils.c.  GCC should do everything else.

Paolo



Re: [Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking

2016-04-05 Thread Peter Maydell
On 4 April 2016 at 14:39,   wrote:
> From: Vijay 
>
> Use Neon instructions to perform zero checking of
> a buffer. This helps in reducing downtime during
> live migration.
>
> Signed-off-by: Vijaya Kumar K 
> ---
>  util/cutils.c |   81 +
>  1 file changed, 81 insertions(+)
>
> diff --git a/util/cutils.c b/util/cutils.c
> index 43d1afb..d343b9a 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
>  return func;
>  }
>  #pragma GCC pop_options
> +
> +#elif defined __aarch64__
> +#include "arm_neon.h"

Can we rely on all compilers having this, or do we need to
test in configure?

> +
> +#define NEON_VECTYPE   uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
> +#define NEON_ORR(v1, v2)   vorrq_u64(v1, v2)
> +#define NEON_EQ_ZERO(v1) \
> +((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
> + (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)

The intrinsics are a bit confusing, but shouldn't we be
testing that both lanes of v1 are 0, rather than whether
either of them is? (so "&&", not "||").

> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
> +
> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +   * sizeof(NEON_VECTYPE)) == 0
> +&& ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +size_t i;
> +NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
> +NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
> +uint64_t const *data = buf;
> +
> +assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +len /= sizeof(unsigned long);
> +
> +for (i = 0; i < len; i += 32) {
> +d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +d4 = NEON_ORR(d0, d1);
> +d5 = NEON_ORR(d2, d3);
> +d6 = NEON_ORR(d4, d5);
> +
> +d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
> +d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
> +d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
> +d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
> +d11 = NEON_ORR(d7, d8);
> +d12 = NEON_ORR(d9, d10);
> +d13 = NEON_ORR(d11, d12);
> +
> +d14 = NEON_ORR(d6, d13);
> +if (NEON_EQ_ZERO(d14)) {
> +break;
> +}
> +}

Both the other optimised find_nonzero implementations in this
file have two loops, not just one. Is it OK that this
implementation has only a single loop?

Paolo: do you know why we have two loops in the other
implementations?

> +
> +return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +/*
> + * Check if neon feature is supported.
> + * By default neon is supported for aarch64.
> + */
> +return true;
> +}

There doesn't seem much point in this. We can assume Neon exists
on any CPU we're going to run on (it's part of the ABI, the kernel
assumes it, etc etc). So you can just implement the functions without
the indirection functions below.

> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
> +   can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +   buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --

thanks
-- PMM



[Qemu-devel] [RFC PATCH v1 2/2] target-arm: Use Neon for zero checking

2016-04-04 Thread vijayak
From: Vijay 

Use Neon instructions to perform zero checking of
a buffer. This helps in reducing downtime during
live migration.

Signed-off-by: Vijaya Kumar K 
---
 util/cutils.c |   81 +
 1 file changed, 81 insertions(+)

diff --git a/util/cutils.c b/util/cutils.c
index 43d1afb..d343b9a 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -352,6 +352,87 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
 return func;
 }
 #pragma GCC pop_options
+
+#elif defined __aarch64__
+#include "arm_neon.h"
+
+#define NEON_VECTYPE   uint64x2_t
+#define NEON_LOAD_N_ORR(v1, v2)vorrq_u64(vld1q_u64(&v1), vld1q_u64(&v2))
+#define NEON_ORR(v1, v2)   vorrq_u64(v1, v2)
+#define NEON_EQ_ZERO(v1) \
+((vgetq_lane_u64(vceqzq_u64(v1), 0) == 0) || \
+ (vgetq_lane_u64(vceqzq_u64(v1), 1)) == 0)
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16
+
+/*
+ * Zero page/buffer checking using SIMD(Neon)
+ */
+
+static bool
+can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
+   * sizeof(NEON_VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
+{
+size_t i;
+NEON_VECTYPE d0, d1, d2, d3, d4, d5, d6;
+NEON_VECTYPE d7, d8, d9, d10, d11, d12, d13, d14;
+uint64_t const *data = buf;
+
+assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
+len /= sizeof(unsigned long);
+
+for (i = 0; i < len; i += 32) {
+d0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
+d1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
+d2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
+d3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
+d4 = NEON_ORR(d0, d1);
+d5 = NEON_ORR(d2, d3);
+d6 = NEON_ORR(d4, d5);
+
+d7 = NEON_LOAD_N_ORR(data[i + 16], data[i + 18]);
+d8 = NEON_LOAD_N_ORR(data[i + 20], data[i + 22]);
+d9 = NEON_LOAD_N_ORR(data[i + 24], data[i + 26]);
+d10 = NEON_LOAD_N_ORR(data[i + 28], data[i + 30]);
+d11 = NEON_ORR(d7, d8);
+d12 = NEON_ORR(d9, d10);
+d13 = NEON_ORR(d11, d12);
+
+d14 = NEON_ORR(d6, d13);
+if (NEON_EQ_ZERO(d14)) {
+break;
+}
+}
+
+return i * sizeof(unsigned long);
+}
+
+static inline bool neon_support(void)
+{
+/*
+ * Check if neon feature is supported.
+ * By default neon is supported for aarch64.
+ */
+return true;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf, len) :
+   can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
+   buffer_find_nonzero_offset_inner(buf, len);
+}
 #else
 bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
 {
-- 
1.7.9.5