copy_buffer benchmark

Marek Olšák Mon, 20 Aug 2018 22:51:26 -0700

From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/Makefile.sources |   2 +-
 src/gallium/drivers/radeonsi/meson.build      |   2 +-
 src/gallium/drivers/radeonsi/si_blit.c        |   2 +-
 src/gallium/drivers/radeonsi/si_cp_dma.c      |   8 +-
 src/gallium/drivers/radeonsi/si_pipe.c        |   8 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |   9 +-
 .../drivers/radeonsi/si_shaderlib_tgsi.c      | 102 ++++
 .../drivers/radeonsi/si_test_clearbuffer.c    | 139 ------
 .../drivers/radeonsi/si_test_dma_perf.c       | 470 ++++++++++++++++++
 9 files changed, 590 insertions(+), 152 deletions(-)
 delete mode 100644 src/gallium/drivers/radeonsi/si_test_clearbuffer.c
 create mode 100644 src/gallium/drivers/radeonsi/si_test_dma_perf.c


diff --git a/src/gallium/drivers/radeonsi/Makefile.sources 
b/src/gallium/drivers/radeonsi/Makefile.sources
index b52db3a0598..abdc4e07f1e 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -37,22 +37,22 @@ C_SOURCES := \
        si_shader_tgsi_setup.c \
        si_shaderlib_tgsi.c \
        si_state.c \
        si_state_binning.c \
        si_state_draw.c \
        si_state_msaa.c \
        si_state_shaders.c \
        si_state_streamout.c \
        si_state_viewport.c \
        si_state.h \
-       si_test_clearbuffer.c \
        si_test_dma.c \
+       si_test_dma_perf.c \
        si_texture.c \
        si_uvd.c \
        ../radeon/r600_perfcounter.c \
        ../radeon/radeon_uvd.c \
        ../radeon/radeon_uvd.h \
        ../radeon/radeon_vcn_dec.c \
        ../radeon/radeon_vcn_dec.h \
        ../radeon/radeon_vcn_enc_1_2.c \
        ../radeon/radeon_vcn_enc.c \
        ../radeon/radeon_vcn_enc.h \
diff --git a/src/gallium/drivers/radeonsi/meson.build 
b/src/gallium/drivers/radeonsi/meson.build
index 57229046de1..4d6044f724b 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -53,22 +53,22 @@ files_libradeonsi = files(
   'si_shader_tgsi_setup.c',
   'si_shaderlib_tgsi.c',
   'si_state.c',
   'si_state.h',
   'si_state_binning.c',
   'si_state_draw.c',
   'si_state_msaa.c',
   'si_state_shaders.c',
   'si_state_streamout.c',
   'si_state_viewport.c',
-  'si_test_clearbuffer.c',
   'si_test_dma.c',
+  'si_test_dma_perf.c',
   'si_texture.c',
   'si_uvd.c',
   '../radeon/r600_perfcounter.c',
   '../radeon/radeon_uvd.c',
   '../radeon/radeon_uvd.h',
   '../radeon/radeon_vcn_enc_1_2.c',
   '../radeon/radeon_vcn_enc.c',
   '../radeon/radeon_vcn_enc.h',
   '../radeon/radeon_vcn_dec.c',
   '../radeon/radeon_vcn_dec.h',
diff --git a/src/gallium/drivers/radeonsi/si_blit.c 
b/src/gallium/drivers/radeonsi/si_blit.c
index cf6495291bd..fcaff80125c 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -903,21 +903,21 @@ void si_resource_copy_region(struct pipe_context *ctx,
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_texture *ssrc = (struct si_texture*)src;
        struct pipe_surface *dst_view, dst_templ;
        struct pipe_sampler_view src_templ, *src_view;
        unsigned dst_width, dst_height, src_width0, src_height0;
        unsigned dst_width0, dst_height0, src_force_level = 0;
        struct pipe_box sbox, dstbox;
 
        /* Handle buffers first. */
        if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
-               si_copy_buffer(sctx, dst, src, dstx, src_box->x, 
src_box->width, 0);
+               si_copy_buffer(sctx, dst, src, dstx, src_box->x, 
src_box->width, 0, -1);
                return;
        }
 
        assert(u_max_sample(dst) == u_max_sample(src));
 
        /* The driver doesn't decompress resources automatically while
         * u_blitter is rendering. */
        si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
                                  src_box->z, src_box->z + src_box->depth - 1);
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 61be22f28b5..486ae75c77f 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -429,32 +429,34 @@ static void si_cp_dma_realign_engine(struct si_context 
*sctx, unsigned size,
 }
 
 /**
  * Do memcpy between buffers using CP DMA.
  *
  * \param user_flags   bitmask of SI_CPDMA_*
  */
 void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-                   unsigned user_flags)
+                   unsigned user_flags, enum si_cache_policy cache_policy)
 {
        uint64_t main_dst_offset, main_src_offset;
        unsigned skipped_size = 0;
        unsigned realign_size = 0;
        enum si_coherency coher = SI_COHERENCY_SHADER;
-       enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
        bool is_first = true;
 
        if (!size)
                return;
 
+       if (cache_policy == -1)
+               cache_policy = get_cache_policy(sctx, coher);
+
        if (dst != src || dst_offset != src_offset) {
                /* Mark the buffer range of destination as valid (initialized),
                 * so that transfer_map knows it should wait for the GPU when 
mapping
                 * that range. */
                util_range_add(&r600_resource(dst)->valid_buffer_range, 
dst_offset,
                               dst_offset + size);
        }
 
        dst_offset += r600_resource(dst)->gpu_address;
        src_offset += r600_resource(src)->gpu_address;
@@ -532,21 +534,21 @@ void si_copy_buffer(struct si_context *sctx,
        /* If it's not a prefetch... */
        if (dst_offset != src_offset)
                sctx->num_cp_dma_calls++;
 }
 
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource 
*buf,
                              uint64_t offset, unsigned size)
 {
        assert(sctx->chip_class >= CIK);
 
-       si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL);
+       si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL, 
L2_LRU);
 }
 
 static void cik_prefetch_shader_async(struct si_context *sctx,
                                      struct si_pm4_state *state)
 {
        struct pipe_resource *bo = &state->bo[0]->b.b;
        assert(state->nbo == 1);
 
        cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 13fcf1f3aea..c259c260550 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -94,21 +94,21 @@ static const struct debug_named_value debug_options[] = {
        { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
        { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main 
framebuffer" },
        { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" },
        { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" },
 
        /* Tests: */
        { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
        { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and 
exit." },
        { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault 
test and exit." },
        { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM 
fault test and exit." },
-       { "testclearbufperf", DBG(TEST_CLEARBUF_PERF), "Test Clearbuffer 
Performance" },
+       { "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" },
 
        DEBUG_NAMED_VALUE_END /* must be last */
 };
 
 static void si_init_compiler(struct si_screen *sscreen,
                             struct ac_llvm_compiler *compiler)
 {
        /* Only create the less-optimizing version of the compiler on APUs
         * predating Ryzen (Raven). */
        bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
@@ -723,21 +723,21 @@ static void si_test_vmfault(struct si_screen *sscreen)
                pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 
64);
 
        if (!buf) {
                puts("Buffer allocation failed.");
                exit(1);
        }
 
        r600_resource(buf)->gpu_address = 0; /* cause a VM fault */
 
        if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) {
-               si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0);
+               si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, -1);
                ctx->flush(ctx, NULL, 0);
                puts("VM fault test: CP - done.");
        }
        if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) {
                sctx->dma_clear_buffer(sctx, buf, 0, 4, 0);
                ctx->flush(ctx, NULL, 0);
                puts("VM fault test: SDMA - done.");
        }
        if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) {
                util_test_constant_buffer(ctx, buf);
@@ -1063,21 +1063,21 @@ struct pipe_screen *radeonsi_screen_create(struct 
radeon_winsys *ws,
                si_init_compiler(sscreen, &sscreen->compiler[i]);
        for (i = 0; i < num_comp_lo_threads; i++)
                si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);
 
        /* Create the auxiliary context. This must be done last. */
        sscreen->aux_context = si_create_context(&sscreen->b, 0);
 
        if (sscreen->debug_flags & DBG(TEST_DMA))
                si_test_dma(sscreen);
 
-       if (sscreen->debug_flags & DBG(TEST_CLEARBUF_PERF)) {
-               si_test_clearbuffer_perf(sscreen);
+       if (sscreen->debug_flags & DBG(TEST_DMA_PERF)) {
+               si_test_dma_perf(sscreen);
        }
 
        if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) |
                                      DBG(TEST_VMFAULT_SDMA) |
                                      DBG(TEST_VMFAULT_SHADER)))
                si_test_vmfault(sscreen);
 
        return &sscreen->b;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index fe06064b388..cfd7622c7a3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -158,21 +158,21 @@ enum {
        DBG_NO_DCC_CLEAR,
        DBG_NO_DCC_FB,
        DBG_NO_DCC_MSAA,
        DBG_NO_FMASK,
 
        /* Tests: */
        DBG_TEST_DMA,
        DBG_TEST_VMFAULT_CP,
        DBG_TEST_VMFAULT_SDMA,
        DBG_TEST_VMFAULT_SHADER,
-       DBG_TEST_CLEARBUF_PERF,
+       DBG_TEST_DMA_PERF,
 };
 
 #define DBG_ALL_SHADERS                (((1 << (DBG_CS + 1)) - 1))
 #define DBG(name)              (1ull << DBG_##name)
 
 struct si_compute;
 struct hash_table;
 struct u_suballocator;
 
 /* Only 32-bit buffer allocations are supported, gallium doesn't support more
@@ -1126,21 +1126,21 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx);
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                            uint64_t offset, uint64_t size, unsigned value,
                            enum si_coherency coher,
                            enum si_cache_policy cache_policy);
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                     uint64_t offset, uint64_t size, unsigned value,
                     enum si_coherency coher);
 void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-                   unsigned user_flags);
+                   unsigned user_flags, enum si_cache_policy cache_policy);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource 
*buf,
                              uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
                struct radeon_saved_cs *saved, bool get_buffer_list);
 void si_clear_saved_cs(struct radeon_saved_cs *saved);
 void si_destroy_saved_cs(struct si_saved_cs *scs);
@@ -1210,27 +1210,30 @@ bool si_check_device_reset(struct si_context *sctx);
 /* si_query.c */
 void si_init_screen_query_functions(struct si_screen *sscreen);
 void si_init_query_functions(struct si_context *sctx);
 void si_suspend_queries(struct si_context *sctx);
 void si_resume_queries(struct si_context *sctx);
 
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                        unsigned num_layers);
 void *si_create_fixed_func_tcs(struct si_context *sctx);
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+                                  unsigned num_dwords_per_thread,
+                                  bool stream_cache_policy, bool is_copy);
 void *si_create_query_result_cs(struct si_context *sctx);
 
 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);
 
-/* si_test_clearbuffer.c */
+/* si_test_dma_perf.c */
-void si_test_clearbuffer_perf(struct si_screen *sscreen);
+void si_test_dma_perf(struct si_screen *sscreen);
 
 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
                                               const struct pipe_video_codec 
*templ);
 
 struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                                 const struct pipe_video_buffer 
*tmpl);
 
 /* si_viewport.c */
 void si_update_vs_viewport_state(struct si_context *ctx);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c 
b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 45bc93ed782..911b710abe6 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -112,20 +112,122 @@ void *si_create_fixed_func_tcs(struct si_context *sctx)
        tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
        tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
 
        ureg_MOV(ureg, tessouter, outer);
        ureg_MOV(ureg, tessinner, inner);
        ureg_END(ureg);
 
        return ureg_create_shader_and_destroy(ureg, &sctx->b);
 }
 
+/**
+ * Create a compute shader implementing clear_buffer or copy_buffer.
+ *
+ * Each thread processes num_dwords_per_thread dwords, split into memory
+ * instructions of at most 4 dwords each. The block size is fixed at 64x1x1.
+ * Buffer 0 is the destination. For copies, buffer 1 is the source; for
+ * clears, the stored value comes from the CS user-data system value.
+ *
+ * \param ctx                    context used to create the compute state
+ * \param num_dwords_per_thread  dwords handled by one thread; must be a
+ *                               non-zero power of two
+ * \param stream_cache_policy    add TGSI_MEMORY_STREAM_CACHE_POLICY to the
+ *                               qualifier of every load and store
+ * \param is_copy                true for copy_buffer, false for clear_buffer
+ * \return whatever ctx->create_compute_state returns, or NULL if an
+ *         allocation failed before the shader could be built
+ */
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+                                  unsigned num_dwords_per_thread,
+                                  bool stream_cache_policy, bool is_copy)
+{
+       assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+       unsigned qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+       if (stream_cache_policy)
+               qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+       /* One memory instruction moves at most 4 dwords. */
+       unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+       unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+       for (unsigned i = 0; i < num_mem_ops; i++) {
+               if (i*4 < num_dwords_per_thread)
+                       inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
+       }
+
+       struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+       if (!ureg)
+               return NULL;
+
+       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+       ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+       /* Only written and read on the clear path (!is_copy). */
+       struct ureg_src value;
+       if (!is_copy) {
+               ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_DWORDS, inst_dwords[0]);
+               value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA, 0);
+       }
+
+       struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+       struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+       struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+       struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+       struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+       struct ureg_src srcbuf;
+       struct ureg_src *values = NULL;
+
+       if (is_copy) {
+               srcbuf = ureg_DECL_buffer(ureg, 1, false);
+               /* One temporary per load, kept until the matching store. */
+               values = malloc(num_mem_ops * sizeof(*values));
+               if (!values) {
+                       ureg_destroy(ureg);
+                       return NULL;
+               }
+       }
+
+       /* If there are multiple stores, the first store writes into 0+tid,
+        * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc.
+        */
+       ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid);
+       /* Convert from a "store size unit" into bytes. */
+       ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
+                 ureg_imm1u(ureg, 4 * inst_dwords[0]));
+       ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+       /* Distance between a load and a store for latency hiding. */
+       unsigned load_store_distance = is_copy ? 8 : 0;
+
+       for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+               /* Index of the store emitted in this iteration; negative
+                * while only loads are being emitted. */
+               int d = i - load_store_distance;
+
+               if (is_copy && i < num_mem_ops) {
+                       if (i) {
+                               ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+                                         ureg_imm1u(ureg, 4 * inst_dwords[i] * 64));
+                       }
+
+                       values[i] = ureg_src(ureg_DECL_temporary(ureg));
+                       struct ureg_dst dst =
+                               ureg_writemask(ureg_dst(values[i]),
+                                              u_bit_consecutive(0, inst_dwords[i]));
+                       struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+                       ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
+                                        qualifier, TGSI_TEXTURE_BUFFER, 0);
+               }
+
+               if (d >= 0) {
+                       if (d) {
+                               ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+                                         ureg_imm1u(ureg, 4 * inst_dwords[d] * 64));
+                       }
+
+                       struct ureg_dst dst =
+                               ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+                       struct ureg_src srcs[] =
+                               {ureg_src(store_addr), is_copy ? values[d] : value};
+                       ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
+                                        qualifier, TGSI_TEXTURE_BUFFER, 0);
+               }
+       }
+       ureg_END(ureg);
+
+       struct pipe_compute_state state = {};
+       state.ir_type = PIPE_SHADER_IR_TGSI;
+       state.prog = ureg_get_tokens(ureg, NULL);
+
+       void *cs = ctx->create_compute_state(ctx, &state);
+       ureg_destroy(ureg);
+       /* The token array from ureg_get_tokens() is owned by the caller and
+        * is not released by ureg_destroy(); free it here to avoid a leak.
+        * NOTE(review): relies on create_compute_state() not retaining
+        * state.prog after it returns - confirm for the driver in use. */
+       ureg_free_tokens(state.prog);
+       free(values);
+       return cs;
+}
+
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result
  * buffer. The thread (optionally) reads a previous summary buffer, then
  * accumulates data from the query result buffer, and writes the result either
  * to a summary buffer to be consumed by the next grid invocation or to the
  * user-supplied buffer.
  *
  * Data layout:
  *
diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c 
b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
deleted file mode 100644
index e863381fd15..00000000000
--- a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2018 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
THE
- * SOFTWARE.
- *
- */
-
-/* This file implements tests on the si_clearbuffer function. */
-
-#include "si_pipe.h"
-
-#define CLEARBUF_MIN 32
-#define CLEARBUF_COUNT 16
-#define CLEARBUF_MEMSZ 1024
-
-static uint64_t
-measure_clearbuf_time(struct pipe_context *ctx,
-                     uint64_t memory_size)
-{
-       struct pipe_query *query_te;
-       union pipe_query_result qresult;
-       struct pipe_resource *buf;
-
-       struct si_context *sctx = (struct si_context*)ctx;
-       struct pipe_screen *screen = ctx->screen;
-
-       buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size);
-
-       query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
-
-       ctx->begin_query(ctx, query_te);
-       /* operation  */
-       si_cp_dma_clear_buffer(sctx, buf, 0, memory_size, 0x00,
-                              SI_COHERENCY_SHADER, L2_LRU);
-       ctx->end_query(ctx, query_te);
-       ctx->get_query_result(ctx, query_te, true, &qresult);
-
-       /* Cleanup. */
-       ctx->destroy_query(ctx, query_te);
-       pipe_resource_reference(&buf, NULL);
-
-       /* Report Results */
-       return qresult.u64;
-}
-
-/**
- * @brief Analyze rate of clearing a 1K Buffer averaged over 16 iterations
- * @param ctx Context of pipe to perform analysis on
- */
-static void
-analyze_clearbuf_perf_avg(struct pipe_context *ctx)
-{
-       uint index = 0;
-       uint64_t result[CLEARBUF_COUNT];
-       uint64_t sum = 0;
-       long long int rate_kBps;
-
-       /* Run Tests. */
-       for (index = 0 ; index < CLEARBUF_COUNT ; index++) {
-               result[index] = measure_clearbuf_time(ctx, CLEARBUF_MEMSZ);
-               sum += result[index];
-       }
-
-       /* Calculate Results. */
-       /*  kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */
-       rate_kBps = CLEARBUF_COUNT*CLEARBUF_MEMSZ;
-       rate_kBps *= 1000UL*1000UL;
-       rate_kBps /= sum;
-
-       /* Display Results. */
-       printf("CP DMA clear_buffer performance (buffer %lu ,repeat %u ):",
-              (uint64_t)CLEARBUF_MEMSZ,
-              CLEARBUF_COUNT );
-       printf(" %llu kB/s\n", rate_kBps );
-}
-
-/**
- * @brief Analyze rate of clearing a range of Buffer sizes
- * @param ctx Context of pipe to perform analysis on
- */
-static void
-analyze_clearbuf_perf_rng(struct pipe_context *ctx)
-{
-       uint index = 0;
-       uint64_t result[CLEARBUF_COUNT];
-       uint64_t mem_size;
-       long long int rate_kBps;
-
-       /* Run Tests. */
-       mem_size = CLEARBUF_MIN;
-       for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) {
-               result[index] = measure_clearbuf_time(ctx, mem_size);
-               mem_size <<= 1;
-       }
-
-       /* Calculate & Display Results. */
-       /*  kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */
-       mem_size = CLEARBUF_MIN;
-       for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) {
-               rate_kBps = mem_size;
-               rate_kBps *= 1000UL*1000UL;
-               rate_kBps /= result[index];
-
-               printf("CP DMA clear_buffer performance (buffer %lu):",
-                      mem_size );
-               printf(" %llu kB/s\n", rate_kBps );
-
-               mem_size <<= 1;
-       }
-}
-
-void si_test_clearbuffer_perf(struct si_screen *sscreen)
-{
-       struct pipe_screen *screen = &sscreen->b;
-       struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
-
-       analyze_clearbuf_perf_avg(ctx);
-       analyze_clearbuf_perf_rng(ctx);
-
-       exit(0);
-}
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c 
b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
new file mode 100644
index 00000000000..46d31a2e16e
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file implements DMA performance benchmarks for buffer clears and copies. */
+
+#include "si_pipe.h"
+#include "si_query.h"
+
+#define MIN_SIZE       256
+#define MAX_SIZE       (128 * 1024 * 1024)
+#define SIZE_SHIFT     1
+#define NUM_RUNS       128
+
+/* Convert a byte count and an elapsed time in nanoseconds into a transfer
+ * rate in MB/s, where 1 MB is 1024 * 1024 bytes. */
+static double get_MBps_rate(unsigned num_bytes, unsigned ns)
+{
+       const double megabytes = num_bytes / (1024.0 * 1024.0);
+       const double seconds = ns / 1000000000.0;
+
+       return megabytes / seconds;
+}
+
+void si_test_dma_perf(struct si_screen *sscreen)
+{
+       struct pipe_screen *screen = &sscreen->b;
+       struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+       struct si_context *sctx = (struct si_context*)ctx;
+       const uint32_t clear_value = 0x12345678;
+       static const unsigned cs_dwords_per_thread_list[] = {1, 4, 16, 64};
+       static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+
+#define NUM_SHADERS (ARRAY_SIZE(cs_dwords_per_thread_list) * 2)
+#define NUM_METHODS (4 + NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+       void *clear_cs[NUM_SHADERS], *copy_cs[NUM_SHADERS];
+
+       for (unsigned i = 0; i < NUM_SHADERS; i++) {
+               clear_cs[i] = si_create_dma_compute_shader(ctx, 
cs_dwords_per_thread_list[i / 2],
+                                                          i % 2, false);
+               copy_cs[i] = si_create_dma_compute_shader(ctx, 
cs_dwords_per_thread_list[i / 2],
+                                                         i % 2, true);
+       }
+
+
+       static const char *method_str[] = {
+               "CP MC   ",
+               "CP L2   ",
+               "CP L2   ",
+               "SDMA    ",
+       };
+       static const char *placement_str[] = {
+               /* Clear */
+               "fill->VRAM",
+               "fill->GTT ",
+               /* Copy */
+               "VRAM->VRAM",
+               "VRAM->GTT ",
+               "GTT ->VRAM",
+       };
+
+       printf("DMA rate is in MB/s for each size. Slow cases are skipped and 
print 0.\n");
+       printf("Heap       ,Method  ,L2p,Wa,");
+       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+               if (size >= 1024)
+                       printf("%6uKB,", size / 1024);
+               else
+                       printf(" %6uB,", size);
+       }
+       printf("\n");
+
+       /* results[log2(size)][placement][method][] */
+       struct si_result {
+               bool is_valid;
+               bool is_cp;
+               bool is_sdma;
+               bool is_cs;
+               unsigned cache_policy;
+               unsigned dwords_per_thread;
+               unsigned waves_per_sh;
+               unsigned score;
+       } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+       /* Run benchmarks. */
+       for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); 
placement++) {
+               bool is_copy = placement >= 2;
+
+               printf("-----------,--------,---,--,");
+               for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= 
SIZE_SHIFT)
+                       printf("--------,");
+               printf("\n");
+
+               for (unsigned method = 0; method < NUM_METHODS; method++) {
+                       bool test_cp = method <= 2;
+                       bool test_sdma = method == 3;
+                       bool test_cs = method >= 4;
+                       unsigned cs_method = method - 4;
+                       unsigned cache_policy = test_cp ? method % 3 :
+                                               test_cs ? L2_LRU + cs_method % 
2 : 0;
+                       unsigned cs_shader = cs_method % NUM_SHADERS;
+                       unsigned cs_dwords_per_thread =
+                               test_cs ? cs_dwords_per_thread_list[cs_shader / 
2] : 0;
+                       unsigned cs_waves_per_sh =
+                               test_cs ? cs_waves_per_sh_list[cs_method / 
NUM_SHADERS] : 0;
+
+                       if (sctx->chip_class == SI) {
+                               /* SI doesn't support CP DMA operations through 
L2. */
+                               if (test_cp && cache_policy != L2_BYPASS)
+                                       continue;
+                               /* WAVES_PER_SH is in multiples of 16 on SI. */
+                               if (test_cs && cs_waves_per_sh % 16 != 0)
+                                       continue;
+                       }
+
+                       printf("%s ,", placement_str[placement]);
+                       if (test_cs) {
+                               printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+                                      cache_policy == L2_LRU ? "LRU" :
+                                      cache_policy == L2_STREAM ? "Str" : "");
+                       } else {
+                               printf("%s,%3s,", method_str[method],
+                                      method == L2_LRU ? "LRU" :
+                                      method == L2_STREAM ? "Str" : "");
+                       }
+                       if (test_cs && cs_waves_per_sh)
+                               printf("%2u,", cs_waves_per_sh);
+                       else
+                               printf("  ,");
+
+                       double score = 0;
+                       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size 
<<= SIZE_SHIFT) {
+                               /* Don't test bigger sizes if it's too slow. 
Print 0. */
+                               if (size >= 512*1024 &&
+                                   score < 400 * (size / (4*1024*1024))) {
+                                       printf("%7.0f ,", 0.0);
+                                       continue;
+                               }
+
+                               enum pipe_resource_usage dst_usage, src_usage;
+                               struct pipe_resource *dst, *src;
+                               struct pipe_query *q[NUM_RUNS];
+                               unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+
+                               if (test_sdma) {
+                                       if (sctx->chip_class == SI)
+                                               query_type = 
SI_QUERY_TIME_ELAPSED_SDMA_SI;
+                                       else
+                                               query_type = 
SI_QUERY_TIME_ELAPSED_SDMA;
+                               }
+
+                               if (placement == 0 || placement == 2 || 
placement == 4)
+                                       dst_usage = PIPE_USAGE_DEFAULT;
+                               else
+                                       dst_usage = PIPE_USAGE_STREAM;
+
+                               if (placement == 2 || placement == 3)
+                                       src_usage = PIPE_USAGE_DEFAULT;
+                               else
+                                       src_usage = PIPE_USAGE_STREAM;
+
+                               dst = pipe_buffer_create(screen, 0, dst_usage, 
size);
+                               src = is_copy ? pipe_buffer_create(screen, 0, 
src_usage, size) : NULL;
+
+                               /* Run tests. */
+                               for (unsigned iter = 0; iter < NUM_RUNS; 
iter++) {
+                                       q[iter] = ctx->create_query(ctx, 
query_type, 0);
+                                       ctx->begin_query(ctx, q[iter]);
+
+                                       if (test_cp) {
+                                               /* CP DMA */
+                                               if (is_copy) {
+                                                       si_copy_buffer(sctx, 
dst, src, 0, 0, size, 0,
+                                                                      
cache_policy);
+                                               } else {
+                                                       
si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
+                                                                              
SI_COHERENCY_NONE, cache_policy);
+                                               }
+                                       } else if (test_sdma) {
+                                               /* SDMA */
+                                               if (is_copy) {
+                                                       struct pipe_box box;
+                                                       u_box_1d(0, size, &box);
+                                                       sctx->dma_copy(ctx, 
dst, 0, 0, 0, 0, src, 0, &box);
+                                               } else {
+                                                       
sctx->dma_clear_buffer(sctx, dst, 0, size, clear_value);
+                                               }
+                                       } else {
+                                               /* Compute */
+                                               /* The memory accesses are 
coalesced, meaning that the 1st instruction writes
+                                                * the 1st contiguous block of 
data for the whole wave, the 2nd instruction
+                                                * writes the 2nd contiguous 
block of data, etc.
+                                                */
+                                               unsigned 
instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+                                               unsigned dwords_per_instruction 
= cs_dwords_per_thread / instructions_per_thread;
+                                               unsigned dwords_per_wave = 
cs_dwords_per_thread * 64;
+
+                                               unsigned num_dwords = size / 4;
+                                               unsigned num_instructions = 
DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+                                               struct pipe_grid_info info = {};
+                                               info.block[0] = MIN2(64, 
num_instructions);
+                                               info.block[1] = 1;
+                                               info.block[2] = 1;
+                                               info.grid[0] = 
DIV_ROUND_UP(num_dwords, dwords_per_wave);
+                                               info.grid[1] = 1;
+                                               info.grid[2] = 1;
+
+                                               struct pipe_shader_buffer sb[2] 
= {};
+                                               sb[0].buffer = dst;
+                                               sb[0].buffer_size = size;
+
+                                               if (is_copy) {
+                                                       sctx->flags |= 
SI_CONTEXT_INV_VMEM_L1 |
+                                                                      
SI_CONTEXT_INV_SMEM_L1;
+
+                                                       sb[1].buffer = src;
+                                                       sb[1].buffer_size = 
size;
+
+                                                       
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+                                                       
ctx->bind_compute_state(ctx, copy_cs[cs_shader]);
+                                               } else {
+                                                       for (unsigned i = 0; i 
< 4; i++)
+                                                               
sctx->cs_user_data[i] = clear_value;
+
+                                                       
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+                                                       
ctx->bind_compute_state(ctx, clear_cs[cs_shader]);
+                                               }
+
+                                               sctx->cs_max_waves_per_sh = 
cs_waves_per_sh;
+                                               ctx->launch_grid(ctx, &info);
+
+                                               /* Wait and flush L2. */
+                                               sctx->flags |= 
SI_CONTEXT_CS_PARTIAL_FLUSH;
+                                               si_emit_cache_flush(sctx);
+                                               sctx->cs_max_waves_per_sh = 0; 
/* disable the limit */
+                                       }
+                                       ctx->end_query(ctx, q[iter]);
+                                       ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+                               }
+                               pipe_resource_reference(&dst, NULL);
+                               pipe_resource_reference(&src, NULL);
+
+                               /* Get results. */
+                               uint64_t min = ~0ull, max = 0, total = 0;
+
+                               for (unsigned iter = 0; iter < NUM_RUNS; 
iter++) {
+                                       union pipe_query_result result;
+
+                                       ctx->get_query_result(ctx, q[iter], 
true, &result);
+                                       ctx->destroy_query(ctx, q[iter]);
+
+                                       min = MIN2(min, result.u64);
+                                       max = MAX2(max, result.u64);
+                                       total += result.u64;
+                               }
+
+                               score = get_MBps_rate(size, total / 
(double)NUM_RUNS);
+                               printf("%7.0f ,", score);
+                               fflush(stdout);
+
+                               struct si_result *r = 
&results[util_logbase2(size)][placement][method];
+                               r->is_valid = true;
+                               r->is_cp = test_cp;
+                               r->is_sdma = test_sdma;
+                               r->is_cs = test_cs;
+                               r->cache_policy = cache_policy;
+                               r->dwords_per_thread = cs_dwords_per_thread;
+                               r->waves_per_sh = cs_waves_per_sh;
+                               r->score = score;
+                       }
+                       puts("");
+               }
+       }
+
+       puts("");
+       puts("static struct si_method");
+       printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t 
size64, bool async, bool cached)\n",
+              sctx->screen->info.name);
+       puts("{");
+       puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+
+       /* Analyze results and find the best methods. */
+       for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); 
placement++) {
+               if (placement == 0)
+                       puts("   if (dst == RADEON_DOMAIN_VRAM) {");
+               else if (placement == 1)
+                       puts("   } else { /* GTT */");
+               else if (placement == 2) {
+                       puts("}");
+                       puts("");
+                       puts("static struct si_method");
+                       printf("get_best_copy_for_%s(enum radeon_bo_domain dst, 
enum radeon_bo_domain src,\n",
+                              sctx->screen->info.name);
+                       printf("                     uint64_t size64, bool 
async, bool cached)\n");
+                       puts("{");
+                       puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+                       puts("   if (src == RADEON_DOMAIN_VRAM && dst == 
RADEON_DOMAIN_VRAM) {");
+               } else if (placement == 3)
+                       puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == 
RADEON_DOMAIN_GTT) {");
+               else
+                       puts("   } else { /* GTT -> VRAM */");
+
+               for (unsigned mode = 0; mode < 3; mode++) {
+                       bool async = mode == 0;
+                       bool cached = mode == 1;
+
+                       if (async)
+                               puts("      if (async) { /* SDMA or async 
compute */");
+                       else if (cached)
+                               puts("      if (cached) { /* gfx ring */");
+                       else
+                               puts("      } else { /* gfx ring - uncached 
*/");
+
+                       /* The list of best chosen methods. */
+                       struct si_result *methods[32];
+                       unsigned method_max_size[32];
+                       unsigned num_methods = 0;
+
+                       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size 
<<= SIZE_SHIFT) {
+                               /* Find the best method. */
+                               struct si_result *best = NULL;
+
+                               for (unsigned i = 0; i < NUM_METHODS; i++) {
+                                       struct si_result *r = 
&results[util_logbase2(size)][placement][i];
+
+                                       if (!r->is_valid)
+                                               continue;
+
+                                       /* Ban CP DMA clears via MC on <= VI. 
They are super slow
+                                        * on GTT, which we can get due to BO 
evictions.
+                                        */
+                                       if (sctx->chip_class <= VI && placement 
== 1 &&
+                                           r->is_cp && r->cache_policy == 
L2_BYPASS)
+                                               continue;
+
+                                       if (async) {
+                                               /* The following constraints 
for compute IBs try to limit
+                                                * resource usage so as not to 
decrease the performance
+                                                * of gfx IBs too much.
+                                                */
+
+                                               /* Don't use CP DMA on 
asynchronous rings, because
+                                                * the engine is shared with 
gfx IBs.
+                                                */
+                                               if (r->is_cp)
+                                                       continue;
+
+                                               /* Don't use L2 caching on 
asynchronous rings to minimize
+                                                * L2 usage.
+                                                */
+                                               if (r->cache_policy == L2_LRU)
+                                                       continue;
+
+                                               /* Asynchronous compute 
recommends waves_per_sh != 0
+                                                * to limit CU usage. */
+                                               if (r->is_cs && r->waves_per_sh 
== 0)
+                                                       continue;
+                                       } else {
+                                               /* SDMA is always asynchronous 
*/
+                                               if (r->is_sdma)
+                                                       continue;
+
+                                               if (cached && r->cache_policy 
== L2_BYPASS)
+                                                       continue;
+                                               if (!cached && r->cache_policy 
== L2_LRU)
+                                                       continue;
+                                       }
+
+                                       if (!best) {
+                                               best = r;
+                                               continue;
+                                       }
+
+                                       /* Assume some measurement error. 
Earlier methods occupy fewer
+                                        * resources, so the next method is 
always more greedy, and we
+                                        * don't want to select it due to a 
measurement error.
+                                        */
+                                       double min_improvement = 1.03;
+
+                                       if (best->score * min_improvement < 
r->score)
+                                               best = r;
+                               }
+
+                               if (num_methods > 0) {
+                                       unsigned i = num_methods - 1;
+
+                                       /* If the best one is also the best for 
the previous size,
+                                        * just bump the size for the previous 
one.
+                                        *
+                                        * If there is no best, it means all 
methods were too slow
+                                        * for this size and were not tested. 
Use the best one for
+                                        * the previous size.
+                                        */
+                                       if (!best ||
+                                           (methods[i]->is_cp == best->is_cp &&
+                                            methods[i]->is_sdma == 
best->is_sdma &&
+                                            methods[i]->is_cs == best->is_cs &&
+                                            methods[i]->cache_policy == 
best->cache_policy &&
+                                            methods[i]->dwords_per_thread == 
best->dwords_per_thread &&
+                                            methods[i]->waves_per_sh == 
best->waves_per_sh)) {
+                                               method_max_size[i] = size;
+                                               continue;
+                                       }
+                               }
+
+                               /* Add it to the list. */
+                               assert(num_methods < ARRAY_SIZE(methods));
+                               methods[num_methods] = best;
+                               method_max_size[num_methods] = size;
+                               num_methods++;
+                       }
+
+                       for (unsigned i = 0; i < num_methods; i++) {
+                               struct si_result *best = methods[i];
+                               unsigned size = method_max_size[i];
+
+                               /* The size threshold is between the current 
benchmarked
+                                * size and the next benchmarked size. */
+                               if (i < num_methods - 1)
+                                       printf("         if (size <= %u) ", 
(size + (size << SIZE_SHIFT)) / 2);
+                               else
+                                       printf("         ");
+                               printf("return get(");
+
+                               assert(best);
+                               if (best->is_cp) {
+                                       printf("CP_DMA, %s, 0, 0);\n",
+                                              best->cache_policy == L2_BYPASS 
? "L2_BYPASS" :
+                                              best->cache_policy == L2_LRU ? 
"L2_LRU" : "L2_STREAM");
+                               }
+                               if (best->is_sdma)
+                                       printf("SDMA, 0, 0, 0);\n");
+                               if (best->is_cs) {
+                                       printf("COMPUTE, %s, %u, %u);\n",
+                                              best->cache_policy == L2_LRU ? 
"L2_LRU" : "L2_STREAM",
+                                              best->dwords_per_thread,
+                                              best->waves_per_sh);
+                               }
+                       }
+               }
+               puts("      }");
+       }
+       puts("   }");
+       puts("}");
+
+       /* Cleanup. */
+       for (unsigned i = 0; i < NUM_SHADERS; i++)
+               ctx->delete_compute_state(ctx, clear_cs[i]);
+       for (unsigned i = 0; i < NUM_SHADERS; i++)
+               ctx->delete_compute_state(ctx, copy_cs[i]);
+       ctx->destroy(ctx);
+       exit(0);
+}
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 7/7] radeonsi: add a thorough clear/copy_buffer benchmark

Reply via email to