On 24/10/2016 07:55, vijay.kil...@gmail.com wrote:
> From: Vijaya Kumar K
>
> The ThunderX pass2 chip requires an explicit prefetch
> instruction to give a prefetch hint.
>
> To speed up live migration on the ThunderX platform,
> a prefetch instruction is added in the zero-buffer-check
> function. The results below show the live migration time improvement
> with the prefetch instruction. A VM with 4 VCPUs and 8GB RAM is migrated.
>
> Without prefetch, total migration time is ~13 seconds;
> with prefetch, total migration time is 9.5 seconds.
>
> The code for decoding the cache size is taken from Richard's
> patch.
>
> Signed-off-by: Vijaya Kumar K
> ---
> util/bufferiszero.c | 37 ++++++++++++++++++++++++++++++++++++-
> 1 file changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 421d945..f50b8df 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -25,6 +25,10 @@
> #include "qemu-common.h"
> #include "qemu/cutils.h"
> #include "qemu/bswap.h"
> +#include <math.h>
> +
> +static uint32_t cache_line_factor = 1;
Let's express this in bytes, with a default value of 64 (so rename
cache_line_factor->cache_line_size).
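For example, something along these lines (just a sketch, name it however
you prefer):

    /* Cache line size in bytes.  Conservative default; the aarch64
       constructor below can refine it from the DZP block size, which is
       4 << (dczid_el0 & 0xf) bytes.  */
    static uint32_t cache_line_size = 64;

The constructor could then store the detected size directly instead of a
factor, so the 64-vs-128 special case disappears.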
> +static uint32_t prefetch_line_dist = 1;
>
> static bool
> buffer_zero_int(const void *buf, size_t len)
> @@ -49,7 +53,8 @@ buffer_zero_int(const void *buf, size_t len)
>          const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
>
>          for (; p + 8 <= e; p += 8) {
> -            __builtin_prefetch(p + 8, 0, 0);
> +            __builtin_prefetch(p +
> +                               (8 * cache_line_factor * prefetch_line_dist), 0, 0);
You should precompute cache_line_size * prefetch_line_dist / sizeof(uint64_t)
in a single variable, prefetch_distance.  That saves reloading two global
variables on every iteration of the loop, and the hot path becomes simply

    __builtin_prefetch(p + prefetch_distance, 0, 0);
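Concretely, something like this (untested, reusing the cache_line_size name
from above):

    /* Prefetch distance in uint64_t elements; defaults to one 64-byte line.  */
    static unsigned prefetch_distance = 64 / sizeof(uint64_t);

and in the constructor, once cache_line_size and prefetch_line_dist are known:

    prefetch_distance = cache_line_size * prefetch_line_dist / sizeof(uint64_t);

so buffer_zero_int only has to load one variable in the hot loop.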
>              if (t) {
>                  return false;
>              }
> @@ -293,6 +298,30 @@ bool test_buffer_is_zero_next_accel(void)
> }
> #endif
>
> +#if defined(__aarch64__)
> +#include "qemu/aarch64-cpuid.h"
> +
> +static void __attribute__((constructor)) aarch64_init_cache_size(void)
> +{
> +    uint64_t t;
> +
> +    /* Use the DZP block size as a proxy for the cacheline size,
> +       since the latter is not available to userspace. This seems
> +       to work in practice for existing implementations. */
> +    asm("mrs %0, dczid_el0" : "=r"(t));
> +    if (pow(2, (t & 0xf)) * 4 >= 128) {
> +        cache_line_factor = 2;
> +    } else {
> +        cache_line_factor = 1;
> +    }
> +
> +    get_aarch64_cpu_id();
> +    if (is_thunderx_pass2_cpu()) {
> +        prefetch_line_dist = 3;
> +    }
> +}
> +#endif
> +
> /*
> * Checks if a buffer is all zeroes
> */
> @@ -305,6 +334,12 @@ bool buffer_is_zero(const void *buf, size_t len)
>     /* Fetch the beginning of the buffer while we select the accelerator. */
>     __builtin_prefetch(buf, 0, 0);
>
> +#if defined(__aarch64__)
> +    if (is_thunderx_pass2_cpu()) {
> +        __builtin_prefetch(buf + 16, 0, 0);
> +        __builtin_prefetch(buf + 32, 0, 0);
This should not be ThunderX or aarch64 specific; it should be a generic loop
along the lines of

    prefetch_distance_bytes = prefetch_line_dist * cache_line_size;
    for (i = 0; i < prefetch_distance_bytes; i += cache_line_size) {
        __builtin_prefetch(buf + i, 0, 0);
    }

In the default case cache_line_size == prefetch_distance_bytes (both are 64),
so you get exactly the same behavior as the existing

    __builtin_prefetch(buf, 0, 0);
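That is, the prologue of buffer_is_zero could end up as just this (untested
sketch, assuming the renamed variables above):

    size_t prefetch_distance_bytes = prefetch_line_dist * cache_line_size;
    size_t i;

    /* Fetch the beginning of the buffer while we select the accelerator.
       With the default values this is a single prefetch, exactly as today;
       CPUs that want a deeper prefetch get one per cache line. */
    for (i = 0; i < prefetch_distance_bytes; i += cache_line_size) {
        __builtin_prefetch(buf + i, 0, 0);
    }

and the whole #if defined(__aarch64__) block goes away.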
Thanks,
Paolo
> +    }
> +#endif
>     /* Use an optimized zero check if possible. Note that this also
>        includes a check for an unrolled loop over 64-bit integers. */
>     return select_accel_fn(buf, len);
>