[Intel-gfx] [PATCH i-g-t v3] benchmarks/gem_wsim: Command submission workload simulator

Tvrtko Ursulin Wed, 05 Apr 2017 09:14:27 -0700

From: Tvrtko Ursulin <tvrtko.ursu...@intel.com>

Tool which emits batch buffers to engines with configurable
sequences, durations, contexts, dependencies and userspace waits.


Unfinished but shows promise so sending out for early feedback.

v2:
 * Load workload descriptors from files. (also -w)
 * Help text.
 * Calibration control if needed. (-t)
 * NORELOC | LUT to eb flags.
 * Added sample workload to wsim/workload1.

v3:
 * Multiple parallel different workloads (-w -w ...).
 * Multi-context workloads.
 * Variable (random) batch length.
 * Load balancing (round robin and queue depth estimation).
 * Workload delays and explicit sync steps.
 * Workload frequency (period) control.

TODO list:

 * Fence support.
 * Move majority of help text to README.
 * Better error handling.
 * Less 1980's workload parsing.
 * Proper workloads.
 * Explicit waits?
 * Threads?
 * ... ?

Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
Cc: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozh...@intel.com>
---
 benchmarks/Makefile.sources |    1 +
 benchmarks/gem_wsim.c       | 1053 +++++++++++++++++++++++++++++++++++++++++++
 benchmarks/wsim/workload1   |    7 +
 benchmarks/wsim/workload2   |    7 +
 benchmarks/wsim/workload3   |    7 +
 benchmarks/wsim/workload4   |    8 +
 benchmarks/wsim/workload5   |    8 +
 benchmarks/wsim/workload6   |    8 +
 8 files changed, 1099 insertions(+)
 create mode 100644 benchmarks/gem_wsim.c
 create mode 100644 benchmarks/wsim/workload1
 create mode 100644 benchmarks/wsim/workload2
 create mode 100644 benchmarks/wsim/workload3
 create mode 100644 benchmarks/wsim/workload4
 create mode 100644 benchmarks/wsim/workload5
 create mode 100644 benchmarks/wsim/workload6

diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index 3af54ebe36f2..3a941150abb3 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -14,6 +14,7 @@ benchmarks_prog_list =                        \
        gem_prw                         \
        gem_set_domain                  \
        gem_syslatency                  \
+       gem_wsim                        \
        kms_vblank                      \
        prime_lookup                    \
        vgem_mmap                       \
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
new file mode 100644
index 000000000000..38041da1f6e3
--- /dev/null
+++ b/benchmarks/gem_wsim.c
@@ -0,0 +1,1053 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+
+#include "intel_chipset.h"
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+
+/*
+ * Engine identifiers used by workload steps. The values index the
+ * NUM_ENGINES sized tables (eb_engine_map, ring_str_map) and the per-engine
+ * arrays in struct workload.
+ */
+enum intel_engine_id {
+       RCS,
+       BCS,
+       VCS,
+       VCS1,
+       VCS2,
+       VECS,
+       NUM_ENGINES
+};
+
+/*
+ * Batch duration: get_duration() picks a value from [min, max] and cur is
+ * the value get_bb_sz() converts into a batch buffer size.
+ */
+struct duration {
+       unsigned int min, max, cur;
+};
+
+/*
+ * Kind of a workload step as handled by run_workload():
+ * BATCH submits an execbuf, SYNC waits on an earlier batch, DELAY sleeps
+ * and PERIOD sleeps relative to the start of the current repeat.
+ */
+enum w_type
+{
+       BATCH,
+       SYNC,
+       DELAY,
+       PERIOD
+};
+
+/* A single parsed workload step plus the objects needed to submit it. */
+struct w_step
+{
+       /* Workload step metadata */
+       enum w_type type;
+       unsigned int context; /* index into workload.ctx_id[] */
+       unsigned int engine; /* enum intel_engine_id value */
+       struct duration duration;
+       int dependency; /* relative (<= 0) index of the step depended upon */
+       /*
+        * BATCH: wait for completion after submit; DELAY/PERIOD: time in
+        * microseconds; SYNC: relative (< 0) index of the step to sync to.
+        */
+       int wait;
+
+       /* Implementation details */
+       struct drm_i915_gem_execbuffer2 eb;
+       struct drm_i915_gem_exec_object2 obj[4];
+       struct drm_i915_gem_relocation_entry reloc;
+       unsigned long bb_sz; /* batch buffer size in bytes at creation */
+       uint32_t bb_handle; /* GEM handle of the batch buffer */
+       uint64_t seqno_offset; /* byte offset of the seqno dword in the batch */
+};
+
+/* A sequence of steps together with the per-run submission state. */
+struct workload
+{
+       unsigned int nr_steps;
+       struct w_step *steps;
+
+       /* Timestamp taken at step 0 of every repeat; used by PERIOD steps. */
+       struct timespec repeat_start;
+
+       /* GEM context ids, indexed by w_step.context. */
+       unsigned int nr_ctxs;
+       uint32_t *ctx_id;
+
+       /* Last seqno handed out per engine (see run_workload()). */
+       unsigned long seqno[NUM_ENGINES];
+       /* Per-engine status objects, only set up when seqnos are tracked. */
+       uint32_t status_page_handle[NUM_ENGINES];
+       uint32_t *status_page[NUM_ENGINES];
+       /* Round-robin toggle: 0 selects VCS1 next, non-zero VCS2. */
+       unsigned int vcs_rr;
+
+       /* Statistics: summed queue depths and number of batches per engine. */
+       unsigned long qd_sum[NUM_ENGINES];
+       unsigned long nr_bb[NUM_ENGINES];
+};
+
+/* Maps an engine id to the execbuf flags which select that engine. */
+static const unsigned int eb_engine_map[NUM_ENGINES] = {
+       [RCS] = I915_EXEC_RENDER,
+       [BCS] = I915_EXEC_BLT,
+       [VCS] = I915_EXEC_BSD,
+       [VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
+       [VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
+       [VECS] = I915_EXEC_VEBOX
+};
+
+/* Reference batch duration, in microseconds, used for calibration. */
+static const unsigned int nop_calibration_us = 1000;
+/*
+ * Scaling factor used by __get_bb_sz(); presumably the number of dwords
+ * which execute in nop_calibration_us (see calibrate_nop()) - set outside
+ * of the code visible here.
+ */
+static unsigned long nop_calibration;
+
+/* When set, diagnostics and summaries are not printed. */
+static bool quiet;
+/* File descriptor passed to all gem_* and ioctl calls. */
+static int fd;
+
+/*
+ * Workload descriptor:
+ *
+ * ctx.engine.duration.dependency.wait,...
+ * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
+ *
+ * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+ *
+ * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
+ */
+
+/* Engine names accepted (case insensitively) in workload descriptors. */
+static const char *ring_str_map[NUM_ENGINES] = {
+       [RCS] = "RCS",
+       [BCS] = "BCS",
+       [VCS] = "VCS",
+       [VCS1] = "VCS1",
+       [VCS2] = "VCS2",
+       [VECS] = "VECS",
+};
+
+/*
+ * Parses a comma separated workload descriptor into a newly allocated
+ * struct workload.
+ *
+ * Each record is either a batch,
+ * "ctx.engine.duration[-max_duration].dependency.wait", or a control step:
+ * "d.<us>" (delay), "p.<us>" (period) or "s.<int < 0>" (sync to an earlier
+ * step). @_desc itself is not modified; a private copy is tokenised.
+ *
+ * Returns the workload (only nr_steps and steps are populated, everything
+ * else is zeroed), or NULL when a record is malformed. In the latter case a
+ * diagnostic is printed to stderr unless quiet is set and all temporary
+ * allocations are released.
+ */
+static struct workload *parse_workload(char *_desc)
+{
+       struct workload *wrk;
+       unsigned int nr_steps = 0;
+       char *desc = strdup(_desc);
+       char *_token, *token = NULL, *tctx = NULL, *tstart = desc;
+       char *field, *fctx = NULL, *sep;
+       /*
+        * Not reset between records: control steps carry over the fields of
+        * the preceding batch record, batch records overwrite all of them.
+        */
+       struct w_step step = { }, *steps = NULL;
+       const char *what = "record";
+       enum w_type type;
+       unsigned int i;
+       long int tmpl;
+       int tmp;
+
+       igt_assert(desc);
+
+       while ((_token = strtok_r(tstart, ",", &tctx)) != NULL) {
+               tstart = NULL;
+               token = strdup(_token);
+               igt_assert(token);
+
+               field = strtok_r(token, ".", &fctx);
+               if (!field)
+                       goto err_record;
+
+               if (!strcasecmp(field, "d")) {
+                       type = DELAY;
+                       what = "delay";
+               } else if (!strcasecmp(field, "p")) {
+                       type = PERIOD;
+                       what = "period";
+               } else if (!strcasecmp(field, "s")) {
+                       type = SYNC;
+                       what = "sync target";
+               } else {
+                       type = BATCH;
+               }
+
+               if (type != BATCH) {
+                       /* Control steps take exactly one numeric argument. */
+                       field = strtok_r(NULL, ".", &fctx);
+                       if (!field)
+                               goto err_record;
+
+                       tmp = atoi(field);
+                       /* Sync targets are relative and must point back. */
+                       if (type == SYNC ? tmp >= 0 : tmp <= 0)
+                               goto err;
+
+                       step.type = type;
+                       step.wait = tmp;
+                       goto add_step;
+               }
+
+               /* Context index. */
+               tmp = atoi(field);
+               if (tmp < 0) {
+                       what = "ctx id";
+                       goto err;
+               }
+               step.context = tmp;
+
+               /* Engine name. */
+               field = strtok_r(NULL, ".", &fctx);
+               if (!field)
+                       goto err_record;
+
+               for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
+                       if (!strcasecmp(field, ring_str_map[i]))
+                               break;
+               }
+               if (i == ARRAY_SIZE(ring_str_map)) {
+                       what = "engine id";
+                       goto err;
+               }
+               step.engine = i;
+
+               /* Duration, optionally a "min-max" range. */
+               field = strtok_r(NULL, ".", &fctx);
+               if (!field)
+                       goto err_record;
+
+               sep = NULL;
+               errno = 0;
+               tmpl = strtol(field, &sep, 10);
+               if (errno == ERANGE || tmpl < 0 || tmpl > UINT_MAX) {
+                       what = "duration";
+                       goto err;
+               }
+               step.duration.min = tmpl;
+
+               if (sep && *sep == '-') {
+                       errno = 0;
+                       tmpl = strtol(sep + 1, NULL, 10);
+                       /* max < min would break the modulo in get_duration() */
+                       if (errno == ERANGE || tmpl < 0 || tmpl > UINT_MAX ||
+                           (unsigned long)tmpl < step.duration.min) {
+                               what = "duration range";
+                               goto err;
+                       }
+                       step.duration.max = tmpl;
+               } else {
+                       step.duration.max = step.duration.min;
+               }
+
+               /* Relative data dependency, zero or pointing backwards. */
+               field = strtok_r(NULL, ".", &fctx);
+               if (!field)
+                       goto err_record;
+
+               tmp = atoi(field);
+               if (tmp > 0) {
+                       what = "forward dependency";
+                       goto err;
+               }
+               step.dependency = tmp;
+
+               /* Wait-for-completion boolean. */
+               field = strtok_r(NULL, ".", &fctx);
+               if (!field)
+                       goto err_record;
+
+               tmp = atoi(field);
+               if (tmp != 0 && tmp != 1) {
+                       what = "wait boolean";
+                       goto err;
+               }
+               step.wait = tmp;
+
+               step.type = BATCH;
+
+add_step:
+               nr_steps++;
+               steps = realloc(steps, sizeof(step) * nr_steps);
+               igt_assert(steps);
+
+               memcpy(&steps[nr_steps - 1], &step, sizeof(step));
+
+               free(token);
+               token = NULL;
+       }
+
+       wrk = calloc(1, sizeof(*wrk));
+       igt_assert(wrk);
+
+       wrk->nr_steps = nr_steps;
+       wrk->steps = steps;
+
+       free(desc);
+
+       return wrk;
+
+err_record:
+       what = "record";
+err:
+       if (!quiet)
+               fprintf(stderr, "Invalid %s at step %u!\n", what, nr_steps);
+
+       free(token);
+       free(steps);
+       free(desc);
+
+       return NULL;
+}
+
+/*
+ * Returns a new workload carrying a private copy of the steps of @_wrk.
+ * All other fields of the clone start out zeroed.
+ */
+static struct workload *
+clone_workload(struct workload *_wrk)
+{
+       const size_t steps_sz = sizeof(struct w_step) * _wrk->nr_steps;
+       struct workload *copy = calloc(1, sizeof(*copy));
+
+       igt_assert(copy);
+
+       copy->nr_steps = _wrk->nr_steps;
+       copy->steps = calloc(copy->nr_steps, sizeof(struct w_step));
+       igt_assert(copy->steps);
+
+       memcpy(copy->steps, _wrk->steps, steps_sz);
+
+       return copy;
+}
+
+/*
+ * Rounds x down to a multiple of y. Arguments are parenthesised so that
+ * expressions can be passed safely; note each is evaluated more than once.
+ */
+#define rounddown(x, y) ((x) - ((x) % (y)))
+#ifndef PAGE_SIZE
+#define PAGE_SIZE (4096)
+#endif
+
+/*
+ * Returns a duration for the next submission: the fixed value when the
+ * range is degenerate, otherwise a rand() based pick from [min, max].
+ */
+static unsigned int get_duration(struct duration *dur)
+{
+       unsigned int span;
+
+       if (dur->min == dur->max)
+               return dur->min;
+
+       span = dur->max + 1 - dur->min;
+
+       return dur->min + rand() % span;
+}
+
+/*
+ * Converts a duration into a batch buffer size in bytes by scaling it with
+ * nop_calibration / nop_calibration_us, rounded up to whole dwords.
+ */
+static unsigned long __get_bb_sz(unsigned int duration)
+{
+       const unsigned long dword = sizeof(uint32_t);
+
+       return ALIGN(duration * nop_calibration * dword / nop_calibration_us,
+                    dword);
+}
+
+/* Batch buffer size in bytes for the currently selected duration. */
+static unsigned long get_bb_sz(struct duration *dur)
+{
+       const unsigned int usecs = dur->cur;
+
+       return __get_bb_sz(usecs);
+}
+
+/*
+ * Writes (terminate == true) or clears (terminate == false) the tail of the
+ * step's batch buffer, located at the size matching w->duration.cur.
+ *
+ * The tail is a single dword (bbe, presumably the batch buffer end command)
+ * or, with seqnos, five dwords: a MI_STORE_DWORD_IMM, two zeroed dwords
+ * covered by w->reloc, the seqno and finally bbe. When clearing, the
+ * command and bbe dwords are written as zero.
+ *
+ * With seqnos w->reloc.offset and w->seqno_offset are also updated to match
+ * the new tail position.
+ */
+static void
+__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
+{
+       const uint32_t bbe = 0xa << 23;
+       unsigned long bb_sz = get_bb_sz(&w->duration);
+       unsigned long mmap_start, cmd_offset, mmap_len;
+       uint32_t *ptr, *cs;
+
+       /* Map from the start of the page which holds the first dword. */
+       mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
+       cmd_offset = bb_sz - mmap_len;
+       mmap_start = rounddown(cmd_offset, PAGE_SIZE);
+       mmap_len += cmd_offset - mmap_start;
+
+       gem_set_domain(fd, w->bb_handle,
+                      I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+       ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+       cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
+
+       if (seqnos) {
+               const int gen = intel_gen(intel_get_drm_devid(fd));
+
+               igt_assert(gen >= 8);
+
+               /* Second dword of the tail: start of the two zeroed dwords. */
+               w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
+               /* Fourth dword of the tail: the seqno value itself. */
+               w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
+
+               *cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
+               *cs++ = 0;
+               *cs++ = 0;
+               *cs++ = seqno;
+       }
+
+       *cs = terminate ? bbe : 0;
+
+       munmap(ptr, mmap_len);
+}
+
+/* Writes the batch tail (optional seqno store plus end) at the current size. */
+static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
+{
+       __emit_bb_end(w, true, seqnos, seqno);
+}
+
+/* Clears the batch tail previously written at the current size. */
+static void unterminate_bb(struct w_step *w, bool seqnos)
+{
+       __emit_bb_end(w, false, seqnos, 0);
+}
+
+/*
+ * Creates all kernel objects a workload needs before it can be run.
+ *
+ * @wrk: workload whose steps have been parsed (or cloned).
+ * @swap_vcs: submit VCS1 steps to VCS2 and vice versa.
+ * @seqnos: create per-engine status objects and make every batch store a
+ *          sequence number into the one belonging to its engine.
+ *
+ * For every BATCH step the execbuf object list is laid out as:
+ * obj[0] a 4096 byte object owned by the step (what dependent steps and
+ * gem_sync() refer to), then the status object (only with seqnos), then
+ * obj[0] of the step depended upon (only with a dependency) and finally the
+ * batch buffer itself.
+ */
+static void
+prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
+{
+       int max_ctx = -1;
+       struct w_step *w;
+       int i;
+
+       if (seqnos) {
+               const unsigned int status_sz = sizeof(uint32_t);
+
+               for (i = 0; i < NUM_ENGINES; i++) {
+                       wrk->status_page_handle[i] = gem_create(fd, status_sz);
+                       wrk->status_page[i] =
+                               gem_mmap__cpu(fd, wrk->status_page_handle[i],
+                                             0, status_sz, PROT_READ);
+               }
+       }
+
+       /*
+        * Create one GEM context per context index referenced by any step,
+        * growing the zero-filled ctx_id array as higher indices show up.
+        */
+       for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+               if ((int)w->context > max_ctx) {
+                       int delta = w->context + 1 - wrk->nr_ctxs;
+
+                       wrk->nr_ctxs += delta;
+                       wrk->ctx_id = realloc(wrk->ctx_id,
+                                             wrk->nr_ctxs * sizeof(uint32_t));
+                       memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
+                              delta * sizeof(uint32_t));
+
+                       max_ctx = w->context;
+               }
+
+               if (!wrk->ctx_id[w->context]) {
+                       struct drm_i915_gem_context_create arg = {};
+
+                       drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+                       igt_assert(arg.ctx_id);
+
+                       wrk->ctx_id[w->context] = arg.ctx_id;
+               }
+       }
+
+       /* Set up the objects and the execbuf of every batch step. */
+       for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+               enum intel_engine_id engine = w->engine;
+               unsigned int bb_i, j = 0;
+
+               if (w->type != BATCH)
+                       continue;
+
+               w->obj[j].handle = gem_create(fd, 4096);
+               w->obj[j].flags = EXEC_OBJECT_WRITE;
+               j++;
+
+               if (seqnos) {
+                       w->obj[j].handle = wrk->status_page_handle[engine];
+                       w->obj[j].flags = EXEC_OBJECT_WRITE;
+                       j++;
+               }
+
+               /* Size the batch for the longest duration it may be given. */
+               bb_i = j++;
+               w->duration.cur = w->duration.max;
+               w->bb_sz = get_bb_sz(&w->duration);
+               w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
+               terminate_bb(w, seqnos, 0);
+               if (seqnos) {
+                       w->reloc.presumed_offset = -1;
+                       /*
+                        * Presumably an index into obj[] (the status object)
+                        * since I915_EXEC_HANDLE_LUT is set below.
+                        */
+                       w->reloc.target_handle = 1;
+                       w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+                       w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;
+               }
+
+               igt_assert(w->dependency <= 0);
+               if (w->dependency) {
+                       int dep_idx = i + w->dependency;
+
+                       igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
+                       igt_assert(wrk->steps[dep_idx].type == BATCH);
+
+                       /*
+                        * Move the batch one slot up and put the object of
+                        * the step depended upon into the freed slot.
+                        */
+                       w->obj[j].handle = w->obj[bb_i].handle;
+                       bb_i = j;
+                       w->obj[j - 1].handle =
+                                       wrk->steps[dep_idx].obj[0].handle;
+                       j++;
+               }
+
+               if (seqnos) {
+                       w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
+                       w->obj[bb_i].relocation_count = 1;
+               }
+
+               w->eb.buffers_ptr = to_user_pointer(w->obj);
+               w->eb.buffer_count = j;
+               w->eb.rsvd1 = wrk->ctx_id[w->context];
+
+               if (swap_vcs && engine == VCS1)
+                       engine = VCS2;
+               else if (swap_vcs && engine == VCS2)
+                       engine = VCS1;
+               w->eb.flags = eb_engine_map[engine];
+               w->eb.flags |= I915_EXEC_HANDLE_LUT;
+               if (!seqnos)
+                       w->eb.flags |= I915_EXEC_NO_RELOC;
+#ifdef DEBUG
+               printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] 
ctx[%u]=%u\n",
+                      i, w->eb.buffer_count, w->obj[0].handle,
+                      w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
+                      w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
+                      w->context, wrk->ctx_id[w->context]);
+#endif
+       }
+}
+
+/* Time from @start to @end in seconds. */
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+       const double secs = end->tv_sec - start->tv_sec;
+       const double nsecs = end->tv_nsec - start->tv_nsec;
+
+       return secs + nsecs / 1e9;
+}
+
+/* Time from @start to @end in microseconds, truncated to an int. */
+static int elapsed_us(const struct timespec *start, const struct timespec *end)
+{
+       const double nsecs = 1e9 * (end->tv_sec - start->tv_sec) +
+                            (end->tv_nsec - start->tv_nsec);
+
+       return nsecs / 1e3;
+}
+
+/*
+ * Round-robin balancer: alternates between VCS1 and VCS2 on every call,
+ * starting with VCS1 while wrk->vcs_rr is zero. @w is not used.
+ */
+static enum intel_engine_id
+rr_balance(struct workload *wrk, struct w_step *w)
+{
+       const unsigned int pick = wrk->vcs_rr ? VCS2 : VCS1;
+
+       wrk->vcs_rr ^= 1;
+
+       return pick;
+}
+
+/*
+ * Queue depth balancer: estimates the outstanding batches on VCS1 and VCS2
+ * as the difference between the last seqno handed out and the value read
+ * from the engine's status page, and picks the engine with the shorter
+ * queue. Ties alternate between the two engines.
+ *
+ * Both depths are accumulated into wrk->qd_sum[] for the final statistics.
+ */
+static enum intel_engine_id
+qd_balance(struct workload *wrk, struct w_step *w)
+{
+       unsigned long depth1, depth2;
+
+       igt_assert(w->engine == VCS);
+
+       depth1 = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
+       wrk->qd_sum[VCS1] += depth1;
+
+       depth2 = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
+       wrk->qd_sum[VCS2] += depth2;
+
+       /* vcs_rr remembers the last pick: zero for VCS1, non-zero for VCS2. */
+       if (depth1 < depth2)
+               wrk->vcs_rr = 0;
+       else if (depth2 < depth1)
+               wrk->vcs_rr = 1;
+       else
+               wrk->vcs_rr ^= 1;
+
+       return wrk->vcs_rr ? VCS2 : VCS1;
+}
+
+/*
+ * Overwrites the seqno dword at w->seqno_offset in the step's batch buffer
+ * through a temporary CPU mapping.
+ */
+static void update_bb_seqno(struct w_step *w, uint32_t seqno)
+{
+       const unsigned long page_start = rounddown(w->seqno_offset, PAGE_SIZE);
+       const unsigned long in_page = w->seqno_offset - page_start;
+       const unsigned long map_len = sizeof(uint32_t) + in_page;
+       uint32_t *slot;
+       char *map;
+
+       gem_set_domain(fd, w->bb_handle,
+                      I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+       /* Mappings start on a page boundary, hence the in-page offset. */
+       map = gem_mmap__cpu(fd, w->bb_handle, page_start, map_len, PROT_WRITE);
+       slot = (uint32_t *)(map + in_page);
+       *slot = seqno;
+
+       munmap(map, map_len);
+}
+
+/*
+ * Executes all steps of a prepared workload @repeat times and, unless quiet
+ * is set, prints a summary.
+ *
+ * @id: identifier used as a prefix of all printed messages.
+ * @wrk: workload set up by prepare_workload().
+ * @repeat: number of times the whole step list is executed.
+ * @balance: optional callback choosing between VCS1 and VCS2 for every
+ *           batch targeting VCS; NULL submits to the engine as given.
+ * @seqnos: batches carry sequence numbers (must match what was passed to
+ *          prepare_workload()). NOTE(review): qd_balance reads the status
+ *          pages, so it presumably requires seqnos - confirm in the caller.
+ */
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
+            enum intel_engine_id (*balance)(struct workload *wrk,
+                                            struct w_step *w), bool seqnos)
+{
+       struct timespec t_start, t_end;
+       struct w_step *w;
+       double t;
+       int i, j;
+
+       clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+       srand(t_start.tv_nsec);
+
+       for (j = 0; j < repeat; j++) {
+               for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+                       enum intel_engine_id engine = w->engine;
+                       uint32_t seqno;
+                       bool seqno_updated = false;
+
+                       if (i == 0)
+                               clock_gettime(CLOCK_MONOTONIC,
+                                             &wrk->repeat_start);
+
+                       if (w->type == DELAY) {
+                               usleep(w->wait);
+                               continue;
+                       } else if (w->type == PERIOD) {
+                               struct timespec now;
+                               int do_sleep;
+
+                               clock_gettime(CLOCK_MONOTONIC, &now);
+                               do_sleep = w->wait -
+                                          elapsed_us(&wrk->repeat_start, &now);
+
+                               /*
+                                * Never sleep for a negative time and never
+                                * fall through to submission, regardless of
+                                * whether the overrun is reported.
+                                */
+                               if (do_sleep > 0)
+                                       usleep(do_sleep);
+                               else if (do_sleep < 0 && !quiet)
+                                       printf("%u: Dropped period @ %u/%u (%dus late)!\n",
+                                              id, j, i, do_sleep);
+                               continue;
+                       } else if (w->type == SYNC) {
+                               int s_idx = i + w->wait;
+
+                               /* The target must be an earlier batch step. */
+                               igt_assert(s_idx >= 0 && s_idx < i);
+                               igt_assert(wrk->steps[s_idx].type == BATCH);
+                               gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
+                               continue;
+                       }
+
+                       wrk->nr_bb[engine]++;
+
+                       if (engine == VCS && balance) {
+                               engine = balance(wrk, w);
+                               /* Counted under VCS and the chosen engine. */
+                               wrk->nr_bb[engine]++;
+
+                               /*
+                                * obj[1] is only the status page when seqnos
+                                * are in use; otherwise it holds the batch or
+                                * a dependency and must be left alone.
+                                */
+                               if (seqnos)
+                                       w->obj[1].handle =
+                                               wrk->status_page_handle[engine];
+
+                               /* Same flags as set up by prepare_workload(). */
+                               w->eb.flags = eb_engine_map[engine];
+                               w->eb.flags |= I915_EXEC_HANDLE_LUT;
+                               if (!seqnos)
+                                       w->eb.flags |= I915_EXEC_NO_RELOC;
+                       }
+
+                       seqno = ++wrk->seqno[engine];
+
+                       if (w->duration.min != w->duration.max) {
+                               unsigned int cur = get_duration(&w->duration);
+
+                               if (cur != w->duration.cur) {
+                                       /* Move the batch tail to the new size. */
+                                       unterminate_bb(w, seqnos);
+                                       w->duration.cur = cur;
+                                       terminate_bb(w, seqnos, seqno);
+                                       seqno_updated = true;
+                               }
+                       }
+
+                       if (seqnos && !seqno_updated)
+                               update_bb_seqno(w, seqno);
+
+                       gem_execbuf(fd, &w->eb);
+
+                       if (w->wait)
+                               gem_sync(fd, w->obj[0].handle);
+               }
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+       t = elapsed(&t_start, &t_end);
+       if (!quiet && !balance)
+               printf("%u: %.3fs elapsed (%.3f workloads/s)\n", id, t, repeat / t);
+       if (!quiet && balance == rr_balance)
+               printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches.\n",
+                      id, t, repeat / t,
+                      wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]);
+       if (!quiet && balance == qd_balance)
+               printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches. Average queue depths %.3f, %.3f.\n",
+                      id, t, repeat / t,
+                      wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2],
+                      (double)wrk->qd_sum[VCS1] / wrk->nr_bb[VCS],
+                      (double)wrk->qd_sum[VCS2] / wrk->nr_bb[VCS]);
+}
+
+/* Releases the step array and the workload structure itself. */
+static void fini_workload(struct workload *wrk)
+{
+       struct w_step *steps = wrk->steps;
+
+       free(steps);
+       free(wrk);
+}
+
+/*
+ * Measures how large a batch buffer of no-ops has to be in order to execute
+ * for nop_calibration_us.
+ *
+ * A buffer terminated only at its very end is submitted repeatedly and its
+ * size is rescaled from the measured time. This repeats until at least five
+ * seconds have passed and two consecutive estimates differ by no more than
+ * @tolerance_pct percent.
+ *
+ * Returns the calibrated size expressed in dwords.
+ */
+static unsigned long calibrate_nop(unsigned int tolerance_pct)
+{
+       const uint32_t bbe = 0xa << 23;
+       unsigned int loops = 17;
+       unsigned int usecs = nop_calibration_us;
+       struct drm_i915_gem_exec_object2 obj = {};
+       struct drm_i915_gem_execbuffer2 eb =
+               { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+       long size, last_size;
+       struct timespec t_0, t_end;
+
+       clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+       size = 256 * 1024;
+       do {
+               struct timespec t_start;
+
+               obj.handle = gem_create(fd, size);
+               gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
+                         sizeof(bbe));
+               /* One untimed submission before the measurement. */
+               gem_execbuf(fd, &eb);
+               gem_sync(fd, obj.handle);
+
+               clock_gettime(CLOCK_MONOTONIC, &t_start);
+               for (unsigned int loop = 0; loop < loops; loop++)
+                       gem_execbuf(fd, &eb);
+               gem_sync(fd, obj.handle);
+               clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+               gem_close(fd, obj.handle);
+
+               last_size = size;
+               size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
+               size = ALIGN(size, sizeof(uint32_t));
+               /* labs(): the operands are long, abs() would truncate. */
+       } while (elapsed(&t_0, &t_end) < 5 ||
+                labs(size - last_size) > (size * tolerance_pct / 100));
+
+       return size / sizeof(uint32_t);
+}
+
+/* Prints the usage text, including the workload descriptor format. */
+static void print_help(void)
+{
+       puts(
+"Usage: gem_wsim [OPTIONS]\n"
+"\n"
+"Runs a simulated workload on the GPU.\n"
+"When run without arguments performs a GPU calibration result of which needs\n"
+"to be provided when running the simulation in subsequent invocations.\n"
+"\n"
+"Options:\n"
+"      -h              This text.\n"
+"      -q              Be quiet - do not output anything to stdout.\n"
+"      -n <n>          Nop calibration value.\n"
+"      -t <n>          Nop calibration tolerance percentage.\n"
+"                      Use when there is a difficulty obtaining calibration\n"
+"                      with the default settings.\n"
+"      -w <desc|path>  Filename or a workload descriptor.\n"
+"                      Can be given multiple times.\n"
+"      -r <n>          How many times to emit the workload.\n"
+"      -c <n>          Fork n clients emitting the workload simultaneously.\n"
+"      -x              Swap VCS1 and VCS2 engines in every other client.\n"
+"      -s              Track batch sequence numbers.\n"
+"      -b <n>          Load balancing to use. (0: rr, 1: qd)\n"
+"\n"
+"Workload descriptor format:\n"
+"\n"
+"      ctx.engine.duration_us.dependency.wait,...\n"
+"      <uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,...\n"
+"      d|p.<uint>,...\n"
+"      s.<int < 0>,...\n"
+"\n"
+"      For duration a range can be given from which a random value will be\n"
+"      picked before every submit. Since this and seqno management requires\n"
+"      CPU access to objects care needs to be taken in order to ensure the\n"
+"      submit queue is deep enough these operations do not affect the\n"
+"      execution speed unless that is desired.\n"
+"\n"
+"      Additional workload steps are also supported:\n"
+"        * 'd' - adds a delay (in microseconds).\n"
+"        * 'p' - adds a delay relative to the start of previous loop so that\n"
+"                each loop starts execution with a given period.\n"
+"        * 's' - synchronises the pipeline to a batch relative to the step.\n"
+"\n"
+"      Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS\n"
+"\n"
+"Example:\n"
+"      1.VCS1.3000.0.1\n"
+"      1.RCS.500-1000.-1.0\n"
+"      d.1000\n"
+"      1.RCS.3700.0.0\n"
+"      1.RCS.1000.-2.0\n"
+"      1.VCS2.2300.-2.0\n"
+"      1.RCS.4700.-1.0\n"
+"      1.VCS2.600.-1.1\n"
+"      p.16000\n"
+"\n"
+"The above workload described in human language works like this:\n"
+"A batch is sent to the VCS1 engine which will be executing for 3ms on the\n"
+"GPU and userspace will wait until it is finished before proceeding.\n"
+"Now three batches are sent to RCS with durations of 0.5-1.0ms (random),\n"
+"3.7ms and 1ms respectively. The first batch has a data dependency on the\n"
+"preceding VCS1 batch, and the last of the group depends on the first from\n"
+"the group.\n"
+"Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms RCS\n"
+"batch, followed by a 4.7ms RCS batch with a data dependency on the 2.3ms\n"
+"VCS2 batch, and finally a 0.6ms VCS2 batch depending on the previous RCS\n"
+"one.\n"
+"The tool is then told to wait for the last one to complete before\n"
+"optionally starting the next iteration (-r).\n"
+"\n"
+"When workload descriptors are provided on the command line, commas must be\n"
+"used instead of newlines.\n"
+       );
+}
+
+/*
+ * Turn a -w argument into a workload descriptor string.
+ *
+ * If @filename does not name a regular file it is returned unchanged, to be
+ * treated as an inline descriptor by the caller. Otherwise the file content
+ * is read into a newly allocated, NUL terminated buffer, newlines are
+ * replaced with commas and trailing commas are stripped.
+ *
+ * Returns either @filename itself or a malloc'ed buffer. An empty file (or
+ * one holding only newlines) yields an empty string.
+ */
+static char *load_workload_descriptor(char *filename)
+{
+       struct stat sbuf;
+       char *buf;
+       int infd, ret;
+       ssize_t len, i;
+
+       ret = stat(filename, &sbuf);
+       if (ret || !S_ISREG(sbuf.st_mode))
+               return filename;
+
+       igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */
+       /* One extra byte so the result is always NUL terminated. */
+       buf = malloc(sbuf.st_size + 1);
+       igt_assert(buf);
+
+       infd = open(filename, O_RDONLY);
+       igt_assert(infd >= 0);
+       len = read(infd, buf, sbuf.st_size);
+       igt_assert(len == sbuf.st_size);
+       close(infd);
+
+       /* Files lacking a final newline would otherwise be unterminated. */
+       buf[len] = 0;
+
+       for (i = 0; i < len; i++) {
+               if (buf[i] == '\n')
+                       buf[i] = ',';
+       }
+
+       /* Strip trailing separators without stepping before the buffer. */
+       while (len > 0 && buf[len - 1] == ',')
+               buf[--len] = 0;
+
+       return buf;
+}
+
+/*
+ * Resize the workload argument array to @nr_args entries and store @w_arg
+ * in the last slot. The reallocation result is checked with igt_assert.
+ *
+ * Returns the (possibly relocated) array.
+ */
+static char **
+add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
+{
+       char **grown = realloc(w_args, nr_args * sizeof(*grown));
+
+       igt_assert(grown);
+       grown[nr_args - 1] = w_arg;
+
+       return grown;
+}
+
+/*
+ * Entry point.
+ *
+ * Parses the command line and then either:
+ *  - runs the nop calibration and exits, when nop_calibration is zero, or
+ *  - loads and parses every -w workload, clones and prepares one workload
+ *    per client, forks the clients to run them and prints the elapsed time.
+ *
+ * Returns 0 on success (including after -h and after calibration) and 1 on
+ * invalid options or workloads.
+ */
+int main(int argc, char **argv)
+{
+       unsigned int repeat = 1;
+       unsigned int clients = 1;
+       bool seqnos = false;
+       bool swap_vcs = false;
+       struct timespec t_start, t_end;
+       struct workload **w, **wrk = NULL;
+       unsigned int nr_w_args = 0;
+       char **w_args = NULL;
+       unsigned int tolerance_pct = 1;
+       enum intel_engine_id (*balance)(struct workload *, struct w_step *) = NULL;
+       double t;
+       unsigned int i;
+       int c;
+
+       fd = drm_open_driver(DRIVER_INTEL);
+
+       while ((c = getopt(argc, argv, "c:n:r:qxw:t:sb:h")) != -1) {
+               switch (c) {
+               case 'w':
+                       w_args = add_workload_arg(w_args, ++nr_w_args, optarg);
+                       break;
+               case 'c':
+                       clients = strtol(optarg, NULL, 0);
+                       break;
+               case 't':
+                       tolerance_pct = strtol(optarg, NULL, 0);
+                       break;
+               case 'n':
+                       nop_calibration = strtol(optarg, NULL, 0);
+                       break;
+               case 'r':
+                       repeat = strtol(optarg, NULL, 0);
+                       break;
+               case 'q':
+                       quiet = true;
+                       break;
+               case 'x':
+                       swap_vcs = true;
+                       break;
+               case 's':
+                       seqnos = true;
+                       break;
+               case 'b':
+                       switch (strtol(optarg, NULL, 0)) {
+                       case 0:
+                               balance = rr_balance;
+                               break;
+                       case 1:
+                               balance = qd_balance;
+                               break;
+                       default:
+                               if (!quiet)
+                                       fprintf(stderr,
+                                               "Unknown balancing mode '%s'!\n",
+                                               optarg);
+                               return 1;
+                       }
+                       break;
+               case 'h':
+                       print_help();
+                       return 0;
+               default:
+                       return 1;
+               }
+       }
+
+       /* Without a calibration value only the calibration pass is run. */
+       if (!nop_calibration) {
+               if (!quiet)
+                       printf("Calibrating nop delay with %u%% tolerance...\n",
+                               tolerance_pct);
+               nop_calibration = calibrate_nop(tolerance_pct);
+               if (!quiet)
+                       printf("Nop calibration for %uus delay is %lu.\n",
+                              nop_calibration_us, nop_calibration);
+
+               return 0;
+       }
+
+       if (!nr_w_args) {
+               if (!quiet)
+                       fprintf(stderr, "No workload descriptor(s)!\n");
+               return 1;
+       }
+
+       if (nr_w_args > 1 && clients > 1) {
+               if (!quiet)
+                       fprintf(stderr,
+                               "Cloned clients cannot be combined with multiple workloads!\n");
+               return 1;
+       }
+
+       wrk = calloc(nr_w_args, sizeof(*wrk));
+       igt_assert(wrk);
+
+       for (i = 0; i < nr_w_args; i++) {
+               w_args[i] = load_workload_descriptor(w_args[i]);
+               if (!w_args[i]) {
+                       if (!quiet)
+                               fprintf(stderr,
+                                       "Failed to load workload descriptor %u!\n",
+                                       i);
+                       return 1;
+               }
+
+               wrk[i] = parse_workload(w_args[i]);
+               if (!wrk[i]) {
+                       if (!quiet)
+                               fprintf(stderr,
+                                       "Failed to parse workload %u!\n", i);
+                       return 1;
+               }
+       }
+
+       /*
+        * One client per workload when several were given. This must not
+        * depend on the quiet flag, otherwise -q would silently run only the
+        * first workload.
+        */
+       if (nr_w_args > 1)
+               clients = nr_w_args;
+
+       if (!quiet) {
+               printf("Using %lu nop calibration for %uus delay.\n",
+                      nop_calibration, nop_calibration_us);
+               printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
+               if (swap_vcs)
+                       printf("Swapping VCS rings between clients.\n");
+       }
+
+       if (balance && !seqnos) {
+               if (!quiet)
+                       fprintf(stderr, "Seqnos are required for load-balancing!\n");
+               return 1;
+       }
+
+       w = calloc(clients, sizeof(struct workload *));
+       igt_assert(w);
+
+       /* With a single workload every client runs a clone of wrk[0]. */
+       for (i = 0; i < clients; i++) {
+               w[i] = clone_workload(wrk[nr_w_args > 1 ? i : 0]);
+               prepare_workload(w[i], swap_vcs && (i & 1), seqnos);
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+       igt_fork(child, clients)
+               run_workload(child, w[child], repeat, balance, seqnos);
+
+       igt_waitchildren();
+
+       clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+       t = elapsed(&t_start, &t_end);
+       if (!quiet)
+               printf("%.3fs elapsed (%.3f workloads/s)\n",
+                      t, clients * repeat / t);
+
+       for (i = 0; i < clients; i++)
+               fini_workload(w[i]);
+       free(w);
+       for (i = 0; i < nr_w_args; i++)
+               fini_workload(wrk[i]);
+       free(wrk);
+       free(w_args);
+
+       return 0;
+}
diff --git a/benchmarks/wsim/workload1 b/benchmarks/wsim/workload1
new file mode 100644
index 000000000000..5f533d8e168b
--- /dev/null
+++ b/benchmarks/wsim/workload1
@@ -0,0 +1,7 @@
+1.VCS1.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS2.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS2.600.-1.1
diff --git a/benchmarks/wsim/workload2 b/benchmarks/wsim/workload2
new file mode 100644
index 000000000000..25a692032eae
--- /dev/null
+++ b/benchmarks/wsim/workload2
@@ -0,0 +1,7 @@
+1.VCS.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS.600.-1.1
diff --git a/benchmarks/wsim/workload3 b/benchmarks/wsim/workload3
new file mode 100644
index 000000000000..bc9f6df52775
--- /dev/null
+++ b/benchmarks/wsim/workload3
@@ -0,0 +1,7 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+0.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
diff --git a/benchmarks/wsim/workload4 b/benchmarks/wsim/workload4
new file mode 100644
index 000000000000..3e4720a6949c
--- /dev/null
+++ b/benchmarks/wsim/workload4
@@ -0,0 +1,8 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+d.1000
+0.RCS.3700.0.0
+1.RCS.1000.-3.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
diff --git a/benchmarks/wsim/workload5 b/benchmarks/wsim/workload5
new file mode 100644
index 000000000000..65440a8264ef
--- /dev/null
+++ b/benchmarks/wsim/workload5
@@ -0,0 +1,8 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+0.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
+p.16000
diff --git a/benchmarks/wsim/workload6 b/benchmarks/wsim/workload6
new file mode 100644
index 000000000000..d5b7141dfdd0
--- /dev/null
+++ b/benchmarks/wsim/workload6
@@ -0,0 +1,8 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+s.-1
+0.RCS.3700.0.0
+1.RCS.1000.-3.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

[Intel-gfx] [PATCH i-g-t v3] benchmarks/gem_wsim: Command submission workload simulator

Reply via email to