On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
> +static void
> +__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
> +{
> +     const uint32_t bbe = 0xa << 23;
> +     unsigned long bb_sz = get_bb_sz(&w->duration);
> +     unsigned long mmap_start, cmd_offset, mmap_len;
> +     uint32_t *ptr, *cs;
> +
> +     mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
> +     cmd_offset = bb_sz - mmap_len;
> +     mmap_start = rounddown(cmd_offset, PAGE_SIZE);
> +     mmap_len += cmd_offset - mmap_start;
> +
> +     gem_set_domain(fd, w->bb_handle,
> +                    I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> +     ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +     cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
> +
> +     if (seqnos) {
> +             const int gen = intel_gen(intel_get_drm_devid(fd));
> +
> +             igt_assert(gen >= 8);
> +
> +             w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
> +             w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
> +
> +             *cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
> +             *cs++ = 0;
> +             *cs++ = 0;
> +             *cs++ = seqno;
> +     }
> +
> +     *cs = terminate ? bbe : 0;
> +
> +     munmap(ptr, mmap_len);
> +}
> +
> +static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
> +{
> +     __emit_bb_end(w, true, seqnos, seqno);
> +}
> +
> +static void unterminate_bb(struct w_step *w, bool seqnos)
> +{
> +     __emit_bb_end(w, false, seqnos, 0);
> +}
> +
> +static void
> +prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
> +{
> +     int max_ctx = -1;
> +     struct w_step *w;
> +     int i;
> +
> +     if (seqnos) {
> +             const unsigned int status_sz = sizeof(uint32_t);
> +
> +             for (i = 0; i < NUM_ENGINES; i++) {
> +                     wrk->status_page_handle[i] = gem_create(fd, status_sz);

Need to call set_cache_level(CACHED) on this object for llc platforms.

You can use one page for all engines. Just use a different cacheline
for each, for safety.

> +                     wrk->status_page[i] =
> +                             gem_mmap__cpu(fd, wrk->status_page_handle[i],
> +                                           0, status_sz, PROT_READ);
> +             }
> +     }
> +
> +     for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +             if ((int)w->context > max_ctx) {
> +                     int delta = w->context + 1 - wrk->nr_ctxs;
> +
> +                     wrk->nr_ctxs += delta;
> +                     wrk->ctx_id = realloc(wrk->ctx_id,
> +                                           wrk->nr_ctxs * sizeof(uint32_t));
> +                     memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
> +                            delta * sizeof(uint32_t));
> +
> +                     max_ctx = w->context;
> +             }
> +
> +             if (!wrk->ctx_id[w->context]) {
> +                     struct drm_i915_gem_context_create arg = {};
> +
> +                     drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
> +                     igt_assert(arg.ctx_id);
> +
> +                     wrk->ctx_id[w->context] = arg.ctx_id;
> +             }
> +     }
> +
> +     for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +             enum intel_engine_id engine = w->engine;
> +             unsigned int bb_i, j = 0;
> +
> +             if (w->type != BATCH)
> +                     continue;
> +
> +             w->obj[j].handle = gem_create(fd, 4096);
> +             w->obj[j].flags = EXEC_OBJECT_WRITE;
> +             j++;
> +
> +             if (seqnos) {
> +                     w->obj[j].handle = wrk->status_page_handle[engine];
> +                     w->obj[j].flags = EXEC_OBJECT_WRITE;

The trick for sharing between engines is to not mark this as a WRITE.
Fun little lies.

> +                     j++;
> +             }
> +
> +             bb_i = j++;
> +             w->duration.cur = w->duration.max;
> +             w->bb_sz = get_bb_sz(&w->duration);
> +             w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
> +             terminate_bb(w, seqnos, 0);
> +             if (seqnos) {
> +                     w->reloc.presumed_offset = -1;
> +                     w->reloc.target_handle = 1;
> +                     w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> +                     w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;

Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't want
to set write_domain here anyway.

> +             }
> +
> +             igt_assert(w->dependency <= 0);
> +             if (w->dependency) {
> +                     int dep_idx = i + w->dependency;
> +
> +                     igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
> +                     igt_assert(wrk->steps[dep_idx].type == BATCH);
> +
> +                     w->obj[j].handle = w->obj[bb_i].handle;
> +                     bb_i = j;
> +                     w->obj[j - 1].handle =
> +                                     wrk->steps[dep_idx].obj[0].handle;
> +                     j++;
> +             }
> +
> +             if (seqnos) {
> +                     w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
> +                     w->obj[bb_i].relocation_count = 1;
> +             }
> +
> +             w->eb.buffers_ptr = to_user_pointer(w->obj);
> +             w->eb.buffer_count = j;
> +             w->eb.rsvd1 = wrk->ctx_id[w->context];
> +
> +             if (swap_vcs && engine == VCS1)
> +                     engine = VCS2;
> +             else if (swap_vcs && engine == VCS2)
> +                     engine = VCS1;
> +             w->eb.flags = eb_engine_map[engine];
> +             w->eb.flags |= I915_EXEC_HANDLE_LUT;
> +             if (!seqnos)
> +                     w->eb.flags |= I915_EXEC_NO_RELOC;

Doesn't look too hard to get the relocation right. Forcing relocations
between batches is probably a good case to check (if only to say: don't
do that).

> +#ifdef DEBUG
> +             printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] 
> ctx[%u]=%u\n",
> +                    i, w->eb.buffer_count, w->obj[0].handle,
> +                    w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
> +                    w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
> +                    w->context, wrk->ctx_id[w->context]);
> +#endif
> +     }
> +}
> +
> +static double elapsed(const struct timespec *start, const struct timespec 
> *end)
> +{
> +     return (end->tv_sec - start->tv_sec) +
> +            (end->tv_nsec - start->tv_nsec) / 1e9;
> +}
> +
> +static int elapsed_us(const struct timespec *start, const struct timespec 
> *end)
> +{

return 1e6 * elapsed(); might as well use gcc for something!

> +     return (1e9 * (end->tv_sec - start->tv_sec) +
> +            (end->tv_nsec - start->tv_nsec)) / 1e3;
> +}
> +
> +static enum intel_engine_id
> +rr_balance(struct workload *wrk, struct w_step *w)
> +{
> +     unsigned int engine;
> +
> +     if (wrk->vcs_rr)
> +             engine = VCS2;
> +     else
> +             engine = VCS1;
> +
> +     wrk->vcs_rr ^= 1;
> +
> +     return engine;
> +}
> +
> +static enum intel_engine_id
> +qd_balance(struct workload *wrk, struct w_step *w)
> +{
> +     unsigned long qd[NUM_ENGINES];
> +     enum intel_engine_id engine = w->engine;
> +
> +     igt_assert(engine == VCS);
> +
> +     qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
> +     wrk->qd_sum[VCS1] += qd[VCS1];
> +
> +     qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
> +     wrk->qd_sum[VCS2] += qd[VCS2];
> +
> +     if (qd[VCS1] < qd[VCS2]) {
> +             engine = VCS1;
> +             wrk->vcs_rr = 0;
> +     } else if (qd[VCS2] < qd[VCS1]) {
> +             engine = VCS2;
> +             wrk->vcs_rr = 1;
> +     } else {
> +             unsigned int vcs = wrk->vcs_rr ^ 1;
> +
> +             wrk->vcs_rr = vcs;
> +
> +             if (vcs == 0)
> +                     engine = VCS1;
> +             else
> +                     engine = VCS2;
> +     }

Hmm. Just thinking we don't even need hw to simulate a load-balancer,
but that would be boring!

> +// printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n", qd[VCS1], qd[VCS2], 
> wrk->vcs_rr, engine);
> +
> +     return engine;
> +}
> +
> +static void update_bb_seqno(struct w_step *w, uint32_t seqno)
> +{
> +     unsigned long mmap_start, mmap_offset, mmap_len;
> +     void *ptr;
> +
> +     mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
> +     mmap_offset = w->seqno_offset - mmap_start;
> +     mmap_len = sizeof(uint32_t) + mmap_offset;
> +
> +     gem_set_domain(fd, w->bb_handle,
> +                    I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> +     ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +
> +     *(uint32_t *)((char *)ptr + mmap_offset) = seqno;

Uh oh. I hope this isn't called inside any loop. Note this is
unsynchronized to the gpu so I wonder what this is for.

> +
> +     munmap(ptr, mmap_len);
> +}
> +
> +static void
> +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
> +          enum intel_engine_id (*balance)(struct workload *wrk,
> +                                          struct w_step *w), bool seqnos)
> +{
> +     struct timespec t_start, t_end;
> +     struct w_step *w;
> +     double t;
> +     int i, j;
> +
> +     clock_gettime(CLOCK_MONOTONIC, &t_start);
> +
> +     srand(t_start.tv_nsec);
> +
> +     for (j = 0; j < repeat; j++) {
> +             for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +                     enum intel_engine_id engine = w->engine;
> +                     uint32_t seqno;
> +                     bool seqno_updated = false;
> +                     int do_sleep = 0;
> +
> +                     if (i == 0)
> +                             clock_gettime(CLOCK_MONOTONIC,
> +                                           &wrk->repeat_start);
> +
> +                     if (w->type == DELAY) {
> +                             do_sleep = w->wait;
> +                     } else if (w->type == PERIOD) {
> +                             struct timespec now;
> +
> +                             clock_gettime(CLOCK_MONOTONIC, &now);
> +                             do_sleep = w->wait -
> +                                        elapsed_us(&wrk->repeat_start, &now);
> +                             if (do_sleep < 0) {
> +                                     if (!quiet) {
> +                                             printf("%u: Dropped period @ 
> %u/%u (%dus late)!\n",
> +                                                    id, j, i, do_sleep);
> +                                             continue;
> +                                     }
> +                             }
> +                     } else if (w->type == SYNC) {
> +                             unsigned int s_idx = i + w->wait;
> +
> +                             igt_assert(i > 0 && i < wrk->nr_steps);
> +                             igt_assert(wrk->steps[s_idx].type == BATCH);
> +                             gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
> +                             continue;
> +                     }
> +
> +                     if (do_sleep) {
> +                             usleep(do_sleep);
> +                             continue;
> +                     }
> +
> +                     wrk->nr_bb[engine]++;
> +
> +                     if (engine == VCS && balance) {
> +                             engine = balance(wrk, w);
> +                             wrk->nr_bb[engine]++;
> +
> +                             w->obj[1].handle = 
> wrk->status_page_handle[engine];
> +
> +                             w->eb.flags = eb_engine_map[engine];
> +                             w->eb.flags |= I915_EXEC_HANDLE_LUT;
> +                     }
> +
> +                     seqno = ++wrk->seqno[engine];
> +
> +                     if (w->duration.min != w->duration.max) {
> +                             unsigned int cur = get_duration(&w->duration);
> +
> +                             if (cur != w->duration.cur) {
> +                                     unterminate_bb(w, seqnos);

Ah, you said this was for adjusting the run-length of the batches. I
suggest using batch_start_offset to change the number of nops rather
than rewriting the batch.

I need to study this a bit more...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to