After applying the diffs you sent on 2021-05-17 and 2021-05-27, I
booted the new kernel, which got all the way to the login prompt.
Along the way I got this:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
i915_ggtt_pin called with NULL vma
WARNING !list_empty(&dev->mode_config.connector_list) failed at
/usr/src/sys/dev/pci/drm/drm_mode_config.c:487
[drm] *ERROR* connector DP-2 leaked!
drm : drm_WARN_ON(d->wake_count)drm : drm_WARN_ON(d->wake_count)Device
initialization failed (-22)
WARNING ({ typeof(vblank->enabled) __tmp = *(volatile typeof(vblank->enabled)
*)&(vblank->enabled); membar_datadep_consumer(); __tmp; }) &&
drm_core_check_feature(dev, DRIVER_MODESET) failed at
/usr/src/sys/dev/pci/drm/drm_vblank.c:440
Automatic boot in progress: starting file system checks.
/dev/sd0a (5e1040cb2dc494f4.a): file system is clean; not checking
...
I then rebooted a few times without problems, until this happened:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
uvm_fault(0xffffffff8218aa20, 0xb9, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at i915_ggtt_pin+0x31: movq 0xb8(%rdi),%r12
ddb{0}> trace
i915_ggtt_pin(1,10000,20) at i915_ggtt_pin+0x31
gen6_ppgtt_pin(ffff80000080f000) at gen6_ppgtt_pin+0x7c
__intel_context_do_pin(fffffd817adb6d80) at __intel_context_do_pin+0xca
intel_engines_init(ffff800000104c38) at intel_engines_init+0x4b5
intel_gt_init(ffff800000104c38) at intel_gt_init+0x130
i915_gem_init(ffff800000100000) at i915_gem_init+0xa3
i915_driver_probe(ffff800000100000,ffffffff8207c330) at i915_driver_probe+0x7ed
inteldrm_attachhook(ffff800000100000) at inteldrm_attachhook+0x43
config_process_deferred_mountroot() at config_process_deferred_mountroot+0x6b
main(0) at main+0x733
end trace frame: 0x0, count: -10
ddb{0}> mach ddbcpu 1
Stopped at x86_ipi_db+0x12: leave
ddb{1}> trace
x86_ipi_db(ffff80001ff39ff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
pagezero() at pagezero+0x1d
end trace frame: 0x0, count: -4
ddb{1}> mach ddbcpu 2
Stopped at x86_ipi_db+0x12: leave
ddb{2}> trace
x86_ipi_db(ffff80001ff42ff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
acpicpu_idle() at acpicpu_idle+0x1ea
sched_idle(ffff80001ff42ff0) at sched_idle+0x27e
end trace frame: 0x0, count: -5
ddb{2}> mach ddbcpu 3
Stopped at x86_ipi_db+0x12: leave
ddb{3}> trace
x86_ipi_db(ffff80001ff4bff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
acpicpu_idle() at acpicpu_idle+0x1ea
sched_idle(ffff80001ff4bff0) at sched_idle+0x27e
end trace frame: 0x0, count: -5
ddb{3}> mach ddbcpu 0
Stopped at i915_ggtt_pin+0x31: movq 0xb8(%rdi),%r12
ddb{0}> ps
PID TID PPID UID S FLAGS WAIT COMMAND
96492 458495 0 0 3 0x14200 bored i915-userptr-acq
47107 142704 0 0 3 0x14200 bored i915_flip
78707 150564 0 0 3 0x14200 bored i915_modeset
65142 186494 0 0 3 0x14200 bored i915-dp
40747 202062 0 0 3 0x14200 bored i915
95885 520940 0 0 3 0x14200 bored smr
53549 455821 0 0 7 0x14200 zerothread
9763 62961 0 0 3 0x14200 aiodoned aiodoned
16393 46637 0 0 2 0x14600 update
96165 57862 0 0 3 0x14200 cleaner cleaner
50668 400747 0 0 3 0x14200 reaper reaper
22827 65470 0 0 3 0x14200 pgdaemon pagedaemon
74791 354628 0 0 3 0x14200 bored crynlk
74563 29562 0 0 3 0x14200 bored crypto
3420 208609 0 0 3 0x14200 usbtsk usbtask
52920 137514 0 0 3 0x14200 usbatsk usbatsk
81094 143259 0 0 3 0x14200 bored drmtskl
67467 377341 0 0 3 0x14200 bored drmlwq
42508 320137 0 0 3 0x14200 bored drmlwq
56022 226224 0 0 3 0x14200 bored drmlwq
67137 287154 0 0 3 0x14200 bored drmlwq
94635 448729 0 0 3 0x14200 bored drmubwq
92127 359644 0 0 3 0x14200 bored drmubwq
88871 457027 0 0 3 0x14200 bored drmubwq
59563 193255 0 0 3 0x14200 bored drmubwq
95387 118220 0 0 3 0x14200 bored drmhpwq
42179 204898 0 0 3 0x14200 bored drmhpwq
33442 342703 0 0 3 0x14200 bored drmhpwq
35835 31321 0 0 3 0x14200 bored drmhpwq
93138 477536 0 0 3 0x14200 bored drmwq
94032 411421 0 0 3 0x14200 bored drmwq
99840 206532 0 0 3 0x14200 bored drmwq
98846 375842 0 0 3 0x14200 bored drmwq
48774 506278 0 0 2 0x40014200 acpi0
70133 127243 0 0 7 0x40014200 idle3
77559 343912 0 0 7 0x40014200 idle2
92051 177112 0 0 1 0x14200 idle1
98224 121451 0 0 2 0x14200 sensors
38426 419300 0 0 3 0x14200 bored softnet
81391 515783 0 0 3 0x14200 bored systqmp
12371 202579 0 0 3 0x14200 bored systq
74071 217005 0 0 2 0x40014200 softclock
15684 18336 0 0 3 0x40014200 idle0
1 231868 0 0 3 0 initexec swapper
* 0 0 -1 0 7 0x10200 swapper
ddb{0}> show uvm
Current UVM status:
pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
971040 VM pages: 0 active, 0 inactive, 20 wired, 963031 free (19081 zero)
min 10% (25) anon, 10% (25) vnode, 5% (12) vtext
freemin=32368, free-target=43157, inactive-target=0, wired-max=323680
faults=1, traps=3, intrs=8601, ctxswitch=168 fpuswitch=0
softint=2869, syscalls=0, kmapent=12
fault counts:
noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
ok relocks(total)=0(0), anget(retries)=0(0), amapcopy=0
neighbor anon/obj pg=0/0, gets(lock/unlock)=0/0
cases: anon=0, anoncow=0, obj=0, prcopy=0, przero=0
daemon and swap counts:
woke=0, revs=0, scans=0, obscans=0, anscans=0
busy=0, freed=0, reactivate=0, deactivate=0
pageouts=0, pending=0, nswget=0
nswapdev=1
swpages=1068660, swpginuse=0, swpgonly=0 paging=0
kernel pointers:
objs(kern)=0xffffffff82160b58
ddb{0}> show bcstats
Current Buffer Cache status:
numbufs 2 busymapped 0, delwri 0
kvaslots 5964 avail kva slots 5964
bufpages 5, dmapages 5, dirtypages 0
pendingreads 0, pendingwrites 0
highflips 0, highflops 0, dmaflips 0
ddb{0}>
On the next reboot, the system booted up with this:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
i915_ggtt_pin called with NULL vma
WARNING !list_empty(&dev->mode_config.connector_list) failed at
/usr/src/sys/dev/pci/drm/drm_mode_config.c:487
[drm] *ERROR* connector DP-2 leaked!
drm : drm_WARN_ON(d->wake_count)drm : drm_WARN_ON(d->wake_count)Device
initialization failed (-22)
WARNING ({ typeof(vblank->enabled) __tmp = *(volatile typeof(vblank->enabled)
*)&(vblank->enabled); membar_datadep_consumer(); __tmp; }) &&
drm_core_check_feature(dev, DRIVER_MODESET) failed at
/usr/src/sys/dev/pci/drm/drm_vblank.c:440
Automatic boot in progress: starting file system checks.
/dev/sd0a (5e1040cb2dc494f4.a): file system is clean; not checking
...
The next reboot did not complete. The boot process stopped here:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
I hope this helps to narrow down the issue.
On 2021-05-27 08:41, Jonathan Gray wrote:
> On Mon, May 17, 2021 at 05:35:02PM +1000, Jonathan Gray wrote:
>> On Tue, May 04, 2021 at 03:44:54PM +0200, [email protected] wrote:
>>> Thanks for the quick help. I built a kernel with your fix.
>>> The system is booting up with a warning, now.
>>>
>>> ...
>>> scsibus3 at softraid0: 256 targets
>>> sd2 at scsibus3 targ 1 lun 0: <OPENBSD, SR RAID 1, 006>
>>> sd2: 122103MB, 512 bytes/sector, 250067198 sectors
>>> root on sd2a (da12fadb67cf7a4d.a) swap on sd2b dump on sd2b
>>> drm : drm_WARN_ON(d->wake_count)drm : drm_WARN_ON(d->wake_count)Device
>>> initialization failed (-22)
>>> Automatic boot in progress: starting file system checks.
>>> /dev/sd2a (da12fadb67cf7a4d.a): file system is clean; not checking
>>> ...
>>
>> Thanks, can you try this?
>
> And this diff with commits backported to -current related to vma/pinning.
>
> drm/i915/gt: Prevent use of engine->wa_ctx after error
> drm/i915: Fix mismatch between misplaced vma check and vma insert
> drm/i915: Hold onto an explicit ref to i915_vma_work.pinned
> drm/i915: Use the active reference on the vma while capturing
>
> diff --git sys/dev/pci/drm/i915/gem/i915_gem_execbuffer.c
> sys/dev/pci/drm/i915/gem/i915_gem_execbuffer.c
> index 971ed84f371..993c2b22f9f 100644
> --- sys/dev/pci/drm/i915/gem/i915_gem_execbuffer.c
> +++ sys/dev/pci/drm/i915/gem/i915_gem_execbuffer.c
> @@ -365,7 +365,7 @@ eb_vma_misplaced(const struct drm_i915_gem_exec_object2
> *entry,
> return true;
>
> if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
> - (vma->node.start + vma->node.size - 1) >> 32)
> + (vma->node.start + vma->node.size + 4095) >> 32)
> return true;
>
> if (flags & __EXEC_OBJECT_NEEDS_MAP &&
> diff --git sys/dev/pci/drm/i915/gt/intel_lrc.c
> sys/dev/pci/drm/i915/gt/intel_lrc.c
> index ac8eade748b..9bdb964d14f 100644
> --- sys/dev/pci/drm/i915/gt/intel_lrc.c
> +++ sys/dev/pci/drm/i915/gt/intel_lrc.c
> @@ -3462,6 +3462,9 @@ err:
> static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
> {
> i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
> +
> + /* Called on error unwind, clear all flags to prevent further use */
> + memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
> }
>
> typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
> diff --git sys/dev/pci/drm/i915/i915_gpu_error.c
> sys/dev/pci/drm/i915/i915_gpu_error.c
> index 9d02829f8df..72e25f3d014 100644
> --- sys/dev/pci/drm/i915/i915_gpu_error.c
> +++ sys/dev/pci/drm/i915/i915_gpu_error.c
> @@ -1346,7 +1346,7 @@ capture_vma(struct intel_engine_capture_vma *next,
> }
>
> strlcpy(c->name, name, sizeof(c->name));
> - c->vma = i915_vma_get(vma);
> + c->vma = vma; /* reference held while active */
>
> c->next = next;
> return c;
> @@ -1456,7 +1456,6 @@ intel_engine_coredump_add_vma(struct
> intel_engine_coredump *ee,
> compress));
>
> i915_active_release(&vma->active);
> - i915_vma_put(vma);
>
> capture = this->next;
> kfree(this);
> diff --git sys/dev/pci/drm/i915/i915_vma.c sys/dev/pci/drm/i915/i915_vma.c
> index 2bf2292ae31..8aca774266c 100644
> --- sys/dev/pci/drm/i915/i915_vma.c
> +++ sys/dev/pci/drm/i915/i915_vma.c
> @@ -331,8 +331,10 @@ static void __vma_release(struct dma_fence_work *work)
> {
> struct i915_vma_work *vw = container_of(work, typeof(*vw), base);
>
> - if (vw->pinned)
> + if (vw->pinned) {
> __i915_gem_object_unpin_pages(vw->pinned);
> + i915_gem_object_put(vw->pinned);
> + }
> }
>
> static const struct dma_fence_work_ops bind_ops = {
> @@ -448,7 +450,7 @@ int i915_vma_bind(struct i915_vma *vma,
>
> if (vma->obj) {
> __i915_gem_object_pin_pages(vma->obj);
> - work->pinned = vma->obj;
> + work->pinned = i915_gem_object_get(vma->obj);
> }
> } else {
> ret = vma->ops->bind_vma(vma, cache_level, bind_flags);
>