Chris Wilson <[email protected]> writes:

> An important property for multi-client systems is that each client gets
> a 'fair' allotment of system time. (Where fairness is at the whim of the
> context properties, such as priorities.) This test forks N independent
> clients (although they happen to share a single vm), does an equal
> amount of work in each client and asserts that they take an equal
> amount of time.
>
> Though we have never claimed to have a completely fair scheduler, that
> is what is expected.
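
To put numbers on "equal amount of work, equal amount of time": each of the
nchild clients submits frame_ns / nchild of GPU work per 16.666ms frame, so
the engine is roughly filled and, with fair scheduling, every client should
see its own frames complete at about the full frame interval.  A purely
illustrative sketch of that arithmetic, using names from the patch below:

        const uint64_t frame_ns = 16666 * 1000;
        int nchild = 3;                                 /* the n = 4 pass below */
        uint64_t work_ns = frame_ns / nchild;           /* ~5.56ms of GPU time each */
        uint64_t expected_ns = work_ns * nchild;        /* ~16.67ms observed per frame */

i.e. the per-child medians printed by fairness() should all cluster around
frame_ns, which is what the (currently disabled) asserts aim to check.
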
>
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Tvrtko Ursulin <[email protected]>
> Cc: Ramalingam C <[email protected]>
> ---
>  tests/i915/gem_exec_schedule.c | 418 +++++++++++++++++++++++++++++++++
>  1 file changed, 418 insertions(+)
>
> diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
> index 56c638833..d1121ecd2 100644
> --- a/tests/i915/gem_exec_schedule.c
> +++ b/tests/i915/gem_exec_schedule.c
> @@ -2495,6 +2495,417 @@ static void measure_semaphore_power(int i915)
>       rapl_close(&pkg);
>  }
>  
> +static int read_timestamp_frequency(int i915)
> +{
> +     int value = 0;
> +     drm_i915_getparam_t gp = {
> +             .value = &value,
> +             .param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
> +     };
> +     ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
> +     return value;
> +}
> +
> +static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
> +{
> +     return (x + y - 1) / y;
> +}
> +
> +static uint64_t ns_to_ticks(int i915, uint64_t ns)
> +{
> +     return div64_u64_round_up(ns * read_timestamp_frequency(i915),
> +                               NSEC_PER_SEC);
> +}
> +
> +static uint64_t ticks_to_ns(int i915, uint64_t ticks)
> +{
> +     return div64_u64_round_up(ticks * NSEC_PER_SEC,
> +                               read_timestamp_frequency(i915));
> +}
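
For reference, a worked example of the tick conversion, assuming (purely for
illustration) that I915_PARAM_CS_TIMESTAMP_FREQUENCY reports a 12MHz
command-streamer timestamp:

        /* hypothetical 12,000,000 Hz timestamp frequency */
        ns_to_ticks(i915, 16666667);    /* ceil(16666667 * 12e6 / 1e9) = 200001 ticks */
        ticks_to_ns(i915, 200000);      /* ceil(200000 * 1e9 / 12e6) = 16666667 ns */

so a 16.67ms frame corresponds to roughly 200k timestamp ticks on such a part.
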
> +
> +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
> +
> +#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
> +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
> +/* Opcodes for MI_MATH_INSTR */
> +#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
> +#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
> +#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
> +#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
> +#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
> +#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
> +#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
> +#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
> +#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
> +#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
> +#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
> +#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
> +/* Registers used as operands in MI_MATH_INSTR */
> +#define   MI_MATH_REG(x)                (x)
> +#define   MI_MATH_REG_SRCA              0x20
> +#define   MI_MATH_REG_SRCB              0x21
> +#define   MI_MATH_REG_ACCU              0x31
> +#define   MI_MATH_REG_ZF                0x32
> +#define   MI_MATH_REG_CF                0x33

Are you thinking that we should just pull the driver's gpu_commands.h
into lib as-is?

-Mika

> +
> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
> +
> +static void delay(int i915,
> +               const struct intel_execution_engine2 *e,
> +               uint32_t handle,
> +               uint64_t addr,
> +               uint64_t ns)
> +{
> +     const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +     const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define TIMESTAMP (base + 0x3a8)
> +     enum { START_TS, NOW_TS };
> +     uint32_t *map, *cs, *jmp;
> +
> +     igt_require(base);
> +
> +     cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(START_TS) + 4;
> +     *cs++ = 0;
> +     *cs++ = MI_LOAD_REGISTER_REG;
> +     *cs++ = TIMESTAMP;
> +     *cs++ = CS_GPR(START_TS);
> +
> +     if (offset_in_page(cs) & 4)
> +             *cs++ = 0;
> +     jmp = cs;
> +
> +     *cs++ = 0x5 << 23; /* MI_ARB_CHECK */
> +
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(NOW_TS) + 4;
> +     *cs++ = 0;
> +     *cs++ = MI_LOAD_REGISTER_REG;
> +     *cs++ = TIMESTAMP;
> +     *cs++ = CS_GPR(NOW_TS);
> +
> +     *cs++ = MI_MATH(4);
> +     *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
> +     *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
> +     *cs++ = MI_MATH_SUB;
> +     *cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
> +
> +     *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +     *cs++ = CS_GPR(NOW_TS);
> +     *cs++ = addr + 4000;
> +     *cs++ = addr >> 32;
> +
> +     *cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
> +     *cs++ = ~ns_to_ticks(i915, ns);
> +     *cs++ = addr + 4000;
> +     *cs++ = addr >> 32;
> +
> +     *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
> +     *cs++ = addr + offset_in_page(jmp);
> +     *cs++ = addr >> 32;
> +
> +     munmap(map, 4096);
> +}
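
If it helps to review the MI_MATH soup above: as I read delay(), the batch
spins re-reading the TIMESTAMP register (base + 0x3a8), with an MI_ARB_CHECK
on every pass so the loop stays preemptible, until the requested number of
nanoseconds has gone by; the ~(now - start) / ~ticks inversion is just there
to give MI_COND_BATCH_BUFFER_END a comparison it can terminate the loop on.
A rough CPU-side equivalent (a sketch only; read_cs_timestamp() is a made-up
helper standing in for the MI_LOAD_REGISTER_REG read of TIMESTAMP):

        static void delay_equivalent(int i915, uint64_t ns)
        {
                uint64_t start = read_cs_timestamp();

                do {
                        ;       /* MI_ARB_CHECK: preemption point each iteration */
                } while (read_cs_timestamp() - start < ns_to_ticks(i915, ns));
        }
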
> +
> +static struct drm_i915_gem_exec_object2
> +delay_create(int i915, uint32_t ctx,
> +          const struct intel_execution_engine2 *e,
> +          uint64_t target_ns)
> +{
> +     struct drm_i915_gem_exec_object2 obj = {
> +             .handle = batch_create(i915),
> +             .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +     };
> +     struct drm_i915_gem_execbuffer2 execbuf = {
> +             .buffers_ptr = to_user_pointer(&obj),
> +             .buffer_count = 1,
> +             .rsvd1 = ctx,
> +             .flags = e->flags,
> +     };
> +
> +     gem_execbuf(i915, &execbuf);
> +     gem_sync(i915, obj.handle);
> +
> +     delay(i915, e, obj.handle, obj.offset, target_ns);
> +
> +     obj.flags |= EXEC_OBJECT_PINNED;
> +     return obj;
> +}
> +
> +static void tslog(int i915,
> +               const struct intel_execution_engine2 *e,
> +               uint32_t handle,
> +               uint64_t addr)
> +{
> +     const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +     const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define CS_TIMESTAMP (base + 0x358)
> +     enum { ONE, MASK, ADDR };
> +     uint32_t *timestamp_lo, *addr_lo;
> +     uint32_t *map, *cs;
> +
> +     igt_require(base);
> +
> +     map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +     cs = map + 512;
> +
> +     *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +     *cs++ = CS_TIMESTAMP;
> +     timestamp_lo = cs;
> +     *cs++ = addr;
> +     *cs++ = addr >> 32;
> +
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(ADDR);
> +     addr_lo = cs;
> +     *cs++ = addr;
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(ADDR) + 4;
> +     *cs++ = addr >> 32;
> +
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(ONE);
> +     *cs++ = 4;
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(ONE) + 4;
> +     *cs++ = 0;
> +
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(MASK);
> +     *cs++ = 0xfffff7ff;
> +     *cs++ = MI_LOAD_REGISTER_IMM;
> +     *cs++ = CS_GPR(MASK) + 4;
> +     *cs++ = 0xffffffff;
> +
> +     *cs++ = MI_MATH(8);
> +     *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ONE));
> +     *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
> +     *cs++ = MI_MATH_ADD;
> +     *cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
> +     *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
> +     *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
> +     *cs++ = MI_MATH_AND;
> +     *cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
> +
> +     *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +     *cs++ = CS_GPR(ADDR);
> +     *cs++ = addr + offset_in_page(timestamp_lo);
> +     *cs++ = addr >> 32;
> +     *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +     *cs++ = CS_GPR(ADDR);
> +     *cs++ = addr + offset_in_page(addr_lo);
> +     *cs++ = addr >> 32;
> +
> +     *cs++ = MI_BATCH_BUFFER_END;
> +
> +     munmap(map, 4096);
> +}
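
And tslog(), as far as I can tell, is a small self-modifying batch: each run
stores CS_TIMESTAMP (base + 0x358) into the current slot, then uses MI_MATH
to add 4 to its own store address and mask bit 11 away (0xfffff7ff), so the
writes wrap around a 512-dword ring in the first 2KiB of the page while the
batch code itself sits from offset 2048 (cs = map + 512, executed with
batch_start_offset = 2048).  Roughly, per execution:

        *slot = CS_TIMESTAMP;                   /* SRM into the current ring slot */
        slot = (slot + 4) & ~(uint64_t)0x800;   /* advance, wrapping inside the   */
                                                /* low 2KiB of the batch page     */

which is what lets fair_child() read back one timestamp per frame afterwards.
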
> +
> +static struct drm_i915_gem_exec_object2
> +tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
> +{
> +     struct drm_i915_gem_exec_object2 obj = {
> +             .handle = batch_create(i915),
> +             .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +     };
> +     struct drm_i915_gem_execbuffer2 execbuf = {
> +             .buffers_ptr = to_user_pointer(&obj),
> +             .buffer_count = 1,
> +             .rsvd1 = ctx,
> +             .flags = e->flags,
> +     };
> +
> +     gem_execbuf(i915, &execbuf);
> +     gem_sync(i915, obj.handle);
> +
> +     tslog(i915, e, obj.handle, obj.offset);
> +
> +     obj.flags |= EXEC_OBJECT_PINNED;
> +     return obj;
> +}
> +
> +static int cmp_u32(const void *A, const void *B)
> +{
> +     const unsigned long *a = A, *b = B;
> +
> +     if (*a < *b)
> +             return -1;
> +     else if (*a > *b)
> +             return 1;
> +     else
> +             return 0;
> +}
> +
> +static void fair_child(int i915, uint32_t ctx,
> +                    const struct intel_execution_engine2 *e,
> +                    uint64_t frame_ns,
> +                    int timeout,
> +                    int timeline,
> +                    unsigned int flags,
> +                    unsigned long *ctl,
> +                    unsigned long *out)
> +#define F_PACING 0x1
> +#define F_EXTERNAL 0x2
> +{
> +     const int batches_per_frame = 3;
> +     struct drm_i915_gem_exec_object2 prev =
> +             delay_create(i915, ctx, e, frame_ns / batches_per_frame);
> +     struct drm_i915_gem_exec_object2 next =
> +             delay_create(i915, ctx, e, frame_ns / batches_per_frame);
> +     struct drm_i915_gem_exec_object2 ts = tslog_create(i915, ctx, e);
> +     struct timespec tv = {};
> +     unsigned long count = 0;
> +     int p_fence = -1, n_fence = -1;
> +     uint32_t *map;
> +     int n;
> +
> +     igt_nsec_elapsed(&tv);
> +     while (!READ_ONCE(*ctl)) {
> +             struct drm_i915_gem_execbuffer2 execbuf = {
> +                     .buffers_ptr = to_user_pointer(&next),
> +                     .buffer_count = 1,
> +                     .rsvd1 = ctx,
> +                     .rsvd2 = -1,
> +                     .flags = e->flags,
> +             };
> +
> +             if (flags & F_EXTERNAL) {
> +                     execbuf.rsvd2 =
> +                             sw_sync_timeline_create_fence(timeline, count);
> +                     execbuf.flags |= I915_EXEC_FENCE_IN;
> +             }
> +
> +             execbuf.flags |= I915_EXEC_FENCE_OUT;
> +             gem_execbuf_wr(i915, &execbuf);
> +             n_fence = execbuf.rsvd2 >> 32;
> +             execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
> +             for (n = 1; n < batches_per_frame; n++)
> +                     gem_execbuf(i915, &execbuf);
> +
> +             execbuf.buffers_ptr = to_user_pointer(&ts);
> +             execbuf.batch_start_offset = 2048;
> +             gem_execbuf(i915, &execbuf);
> +
> +             if (flags & F_PACING && p_fence != -1) {
> +                     struct pollfd pfd = {
> +                             .fd = p_fence,
> +                             .events = POLLIN,
> +                     };
> +                     poll(&pfd, 1, -1);
> +             }
> +             close(p_fence);
> +             close(execbuf.rsvd2);
> +
> +             igt_swap(prev, next);
> +             igt_swap(p_fence, n_fence);
> +             count++;
> +     }
> +     gem_sync(i915, prev.handle);
> +     close(p_fence);
> +
> +     gem_close(i915, next.handle);
> +     gem_close(i915, prev.handle);
> +
> +     map = gem_mmap__device_coherent(i915, ts.handle, 0, 4096, PROT_WRITE);
> +     for (n = 1; n < min(count, 512); n++)
> +             map[n - 1] = map[n] - map[n - 1];
> +     qsort(map, --n, sizeof(*map), cmp_u32);
> +     *out = ticks_to_ns(i915, map[n / 2]);
> +     munmap(map, 4096);
> +
> +     gem_close(i915, ts.handle);
> +}
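
For the record, my reading of the per-frame loop in fair_child() is
(pseudocode, not a drop-in replacement):

        while (!*stop) {
                /* three delay batches of frame_ns/3 each; the first carries the
                 * optional external in-fence and always requests an out-fence */
                submit(next, in_fence, &n_fence);
                submit(next);
                submit(next);
                submit(ts);                     /* log one CS_TIMESTAMP per frame */

                if (flags & F_PACING)
                        wait_on(p_fence);       /* don't run ahead of frame N-1 */

                swap(prev, next);
                swap(p_fence, n_fence);
        }

after which the consecutive timestamp deltas are sorted and the median is
converted back to nanoseconds as the child's result.
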
> +
> +static int cmp_ul(const void *A, const void *B)
> +{
> +     const unsigned long *a = A, *b = B;
> +
> +     if (*a < *b)
> +             return -1;
> +     else if (*a > *b)
> +             return 1;
> +     else
> +             return 0;
> +}
> +
> +static void fairness(int i915,
> +                  const struct intel_execution_engine2 *e,
> +                  int timeout, unsigned int flags)
> +{
> +     const int frame_ns = 16666 * 1000;
> +     unsigned long *result;
> +
> +     igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
> +     igt_require(gem_class_has_mutable_submission(i915, e->class));
> +
> +     result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +
> +     for (int n = 2; n <= 16; n <<= 1) {
> +             int timeline = sw_sync_timeline_create();
> +             int nframes = timeout * NSEC_PER_SEC / frame_ns + 1;
> +             const int nchild = n - 1; /* odd for easy medians */
> +             const int lo = nchild / 4;
> +             const int hi = (3 * nchild + 3) / 4 - 1;
> +             struct igt_mean m;
> +
> +             memset(result, 0, (nchild + 1) * sizeof(result[0]));
> +             igt_fork(child, nchild) {
> +                     uint32_t ctx = gem_context_clone_with_engines(i915, 0);
> +
> +                     fair_child(i915, ctx, e, frame_ns / nchild,
> +                                timeout, timeline, flags,
> +                                &result[nchild],
> +                                &result[child]);
> +
> +                     gem_context_destroy(i915, ctx);
> +             }
> +
> +             while (nframes--) {
> +                     struct timespec tv = { .tv_nsec = frame_ns };
> +                     nanosleep(&tv, NULL);
> +                     sw_sync_timeline_inc(timeline, 1);
> +             }
> +             result[nchild] = 1;
> +             for (int child = 0; child < nchild; child++) {
> +                     while (!READ_ONCE(result[child])) {
> +                             struct timespec tv = { .tv_nsec = frame_ns };
> +                             nanosleep(&tv, NULL);
> +                             sw_sync_timeline_inc(timeline, 1);
> +                     }
> +             }
> +             igt_waitchildren();
> +             close(timeline);
> +
> +             igt_mean_init(&m);
> +             for (int child = 0; child < nchild; child++)
> +                     igt_mean_add(&m, result[child]);
> +
> +             qsort(result, nchild, sizeof(*result), cmp_ul);
> +             igt_info("%d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
> +                      nchild,
> +                      1e-6 * result[0],  1e-6 * result[nchild - 1],
> +                      1e-6 * result[lo], 1e-6 * result[hi],
> +                      1e-6 * result[nchild / 2],
> +                      1e-6 * igt_mean_get(&m),
> +                      1e-6 * sqrt(igt_mean_get_variance(&m)));
> +
> +#if 0
> +             /* Mean within 10% of target */
> +             igt_assert( 9 * igt_mean_get(&m) > 10 * frame_ns &&
> +                        10 * igt_mean_get(&m) <  9 * frame_ns);
> +
> +             /* Variance [inter-quartile range] is less than 33% of median */
> +             igt_assert(3 * result[hi] - result[lo] < result[nchild / 2]);
> +#endif
> +     }
> +
> +     munmap(result, 4096);
> +}
> +
>  #define test_each_engine(T, i915, e) \
>       igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
>               igt_dynamic_f("%s", e->name)
> @@ -2589,6 +3000,13 @@ igt_main
>               test_each_engine_store("promotion", fd, e)
>                       promotion(fd, e->flags);
>  
> +             test_each_engine_store("fair-none", fd, e)
> +                     fairness(fd, e, 2, 0);
> +             test_each_engine_store("fair-pace", fd, e)
> +                     fairness(fd, e, 2, F_PACING);
> +             test_each_engine_store("fair-sync", fd, e)
> +                     fairness(fd, e, 2, F_PACING | F_EXTERNAL);
> +
>               igt_subtest_group {
>                       igt_fixture {
>                               igt_require(gem_scheduler_has_preemption(fd));
> -- 
> 2.27.0.rc2
>