Ok, here you go ...
In approx. 4 out of 10 times the boot process does not conclude.
First variant:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
gen6 pad1 0 0
pad1 4095 ffff8000
pad2 0 81c7a0
pad2 4095 0
kernel: protection fault trap, code=0
Stopped at i915_vma_pin+0x167: testl %r15d,0x160(%rax)
ddb{1}> trace
i915_vma_pin(ffff80000081c780,0,10000,420) at i915_vma_pin+0x167
i915_ggtt_pin(ffff80000081c780,10000,20) at i915_ggtt_pin+0x91
gen6_ppgtt_pin(ffff80000080f000) at gen6_ppgtt_pin+0xd7
__intel_context_do_pin(fffffd817e14ed80) at __intel_context_do_pin+0xca
intel_engines_init(ffff800000104c38) at intel_engines_init+0x4b5
intel_gt_init(ffff800000104c38) at intel_gt_init+0x130
i915_gem_init(ffff800000100000) at i915_gem_init+0xa3
i915_driver_probe(ffff800000100000,ffffffff81f14df0) at i915_driver_probe+0x7ed
inteldrm_attachhook(ffff800000100000) at inteldrm_attachhook+0x43
config_process_deferred_mountroot() at config_process_deferred_mountroot+0x6b
main(0) at main+0x733
end trace frame: 0x0, count: -11
ddb{1}> mach ddbcpu 0
Stopped at x86_ipi_db+0x12: leave
ddb{0}> trace
x86_ipi_db(ffffffff820f7ff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
_kernel_lock() at _kernel_lock+0xb2
softintr_dispatch(0) at softintr_dispatch+0x4a
Xsoftclock() at Xsoftclock+0x1f
acpicpu_idle() at acpicpu_idle+0x1ea
sched_idle(ffffffff820f7ff0) at sched_idle+0x27e
end trace frame: 0x0, count: -8
ddb{0}> mach ddb cpu 2
Symbol not found
ddb{0}> mach ddbcpu 2
Stopped at x86_ipi_db+0x12: leave
ddb{2}> trace
x86_ipi_db(ffff80001ff42ff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
acpicpu_idle() at acpicpu_idle+0x1ea
sched_idle(ffff80001ff42ff0) at sched_idle+0x27e
end trace frame: 0x0, count: -5
ddb{2}> mach ddbcpu 3
Stopped at x86_ipi_db+0x12: leave
ddb{3}> trace
x86_ipi_db(ffff80001ff4bff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
mtx_enter_try(ffffffff822418a8) at mtx_enter_try+0x84
mtx_enter(ffffffff822418a8) at mtx_enter+0x35
uvm_pagezero_thread(ffff800020259a40) at uvm_pagezero_thread+0x74
end trace frame: 0x0, count: -6
ddb{3}> mach ddbcpu 1
Stopped at i915_vma_pin+0x167: testl %r15d,0x160(%rax)
ddb{1}> ps
PID TID PPID UID S FLAGS WAIT COMMAND
57276 406435 0 0 3 0x14200 bored i915-userptr-acq
18828 410208 0 0 3 0x14200 bored i915_flip
72532 503959 0 0 3 0x14200 bored i915_modeset
26696 414393 0 0 3 0x14200 bored i915-dp
76284 351576 0 0 3 0x14200 bored i915
27501 406247 0 0 3 0x14200 bored smr
24358 292417 0 0 7 0x14200 zerothread
58546 370476 0 0 3 0x14200 aiodoned aiodoned
69702 385016 0 0 3 0x14200 syncer update
33778 379219 0 0 3 0x14200 cleaner cleaner
29467 427921 0 0 3 0x14200 reaper reaper
71585 17515 0 0 3 0x14200 pgdaemon pagedaemon
38422 37499 0 0 3 0x14200 bored crynlk
96968 404247 0 0 3 0x14200 bored crypto
85767 451266 0 0 3 0x14200 usbtsk usbtask
64966 267267 0 0 3 0x14200 usbatsk usbatsk
56027 154673 0 0 3 0x14200 bored drmtskl
69848 454575 0 0 3 0x14200 bored drmlwq
90459 232289 0 0 3 0x14200 bored drmlwq
7935 270871 0 0 3 0x14200 bored drmlwq
34417 347812 0 0 3 0x14200 bored drmlwq
40915 390021 0 0 3 0x14200 bored drmubwq
35151 389374 0 0 3 0x14200 bored drmubwq
57449 443683 0 0 3 0x14200 bored drmubwq
71227 313856 0 0 3 0x14200 bored drmubwq
49029 510183 0 0 3 0x14200 bored drmhpwq
74799 162896 0 0 3 0x14200 bored drmhpwq
1459 354055 0 0 3 0x14200 bored drmhpwq
64384 453576 0 0 3 0x14200 bored drmhpwq
30147 158249 0 0 3 0x14200 bored drmwq
31840 434648 0 0 3 0x14200 bored drmwq
48853 206487 0 0 3 0x14200 bored drmwq
45954 355302 0 0 3 0x14200 bored drmwq
63591 457879 0 0 3 0x40014200 acpi0 acpi0
34899 137527 0 0 1 0x14200 idle3
42295 187703 0 0 7 0x40014200 idle2
5356 154045 0 0 3 0x40014200 idle1
60193 384404 0 0 3 0x14200 bored sensors
59943 96119 0 0 3 0x14200 bored softnet
94208 343230 0 0 3 0x14200 bored systqmp
75880 413625 0 0 3 0x14200 bored systq
32713 128466 0 0 3 0x40014200 bored softclock
16101 240213 0 0 7 0x40014200 idle0
1 275798 0 0 3 0 initexec swapper
* 0 0 -1 0 7 0x10200 swapper
ddb{1}> show uvm
Current UVM status:
pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
971041 VM pages: 0 active, 0 inactive, 20 wired, 963032 free (7664 zero)
min 10% (25) anon, 10% (25) vnode, 5% (12) vtext
freemin=32368, free-target=43157, inactive-target=0, wired-max=323680
faults=0, traps=3, intrs=6440, ctxswitch=166 fpuswitch=0
softint=137, syscalls=0, kmapent=12
fault counts:
noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
ok relocks(total)=0(0), anget(retries)=0(0), amapcopy=0
neighbor anon/obj pg=0/0, gets(lock/unlock)=0/0
cases: anon=0, anoncow=0, obj=0, prcopy=0, przero=0
daemon and swap counts:
woke=0, revs=0, scans=0, obscans=0, anscans=0
busy=0, freed=0, reactivate=0, deactivate=0
pageouts=0, pending=0, nswget=0
nswapdev=1
swpages=1068660, swpginuse=0, swpgonly=0 paging=0
kernel pointers:
objs(kern)=0xffffffff821ac878
ddb{1}> show bcstats
Current Buffer Cache status:
numbufs 2 busymapped 0, delwri 0
kvaslots 5964 avail kva slots 5964
bufpages 5, dmapages 5, dirtypages 0
pendingreads 0, pendingwrites 0
highflips 0, highflops 0, dmaflips 0
ddb{1}>
Second variant:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
gen6 pad1 0 0
pad1 4095 ffff8000
pad2 0 8187a0
pad2 4095 0
kernel: protection fault trap, code=0
Stopped at i915_vma_pin+0x167: testl %r15d,0x160(%rax)
ddb{1}> trace
i915_vma_pin(ffff800000818780,0,10000,420) at i915_vma_pin+0x167
i915_ggtt_pin(ffff800000818780,10000,20) at i915_ggtt_pin+0x91
gen6_ppgtt_pin(ffff80000080e000) at gen6_ppgtt_pin+0xd7
__intel_context_do_pin(fffffd817e160d80) at __intel_context_do_pin+0xca
intel_engines_init(ffff800000104c38) at intel_engines_init+0x4b5
intel_gt_init(ffff800000104c38) at intel_gt_init+0x130
i915_gem_init(ffff800000100000) at i915_gem_init+0xa3
i915_driver_probe(ffff800000100000,ffffffff81fbff40) at i915_driver_probe+0x7ed
inteldrm_attachhook(ffff800000100000) at inteldrm_attachhook+0x43
config_process_deferred_mountroot() at config_process_deferred_mountroot+0x6b
main(0) at main+0x733
end trace frame: 0x0, count: -11
ddb{1}> mach ddbcpu 0
Stopped at x86_ipi_db+0x12: leave
ddb{0}> trace
x86_ipi_db(ffffffff820f3ff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
_kernel_lock() at _kernel_lock+0xb2
softintr_dispatch(0) at softintr_dispatch+0x4a
Xsoftclock() at Xsoftclock+0x1f
acpicpu_idle() at acpicpu_idle+0x1ea
sched_idle(ffffffff820f3ff0) at sched_idle+0x27e
end trace frame: 0x0, count: -8
ddb{0}> mach ddbcpu 2
Stopped at x86_ipi_db+0x12: leave
ddb{2}> trace
x86_ipi_db(ffff80001ff42ff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
pagezero() at pagezero+0x29
end trace frame: 0x0, count: -4
ddb{2}> mach ddbcpu 3
Stopped at x86_ipi_db+0x12: leave
ddb{3}> trace
x86_ipi_db(ffff80001ff4bff0) at x86_ipi_db+0x12
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x23
acpicpu_idle() at acpicpu_idle+0x1ea
sched_idle(ffff80001ff4bff0) at sched_idle+0x27e
end trace frame: 0x0, count: -5
ddb{3}> mach ddbcpu 1
Stopped at i915_vma_pin+0x167: testl %r15d,0x160(%rax)
ddb{1}> ps
PID TID PPID UID S FLAGS WAIT COMMAND
57947 393667 0 0 3 0x14200 bored i915-userptr-acq
981 256282 0 0 3 0x14200 bored i915_flip
72134 168056 0 0 3 0x14200 bored i915_modeset
23700 230975 0 0 3 0x14200 bored i915-dp
21629 383627 0 0 3 0x14200 bored i915
21841 362736 0 0 3 0x14200 bored smr
95539 39484 0 0 7 0x14200 zerothread
33137 71718 0 0 3 0x14200 aiodoned aiodoned
34771 298806 0 0 3 0x14200 syncer update
70634 43467 0 0 3 0x14200 cleaner cleaner
59404 46135 0 0 3 0x14200 reaper reaper
41881 479280 0 0 3 0x14200 pgdaemon pagedaemon
4574 523029 0 0 3 0x14200 bored crynlk
12450 482595 0 0 3 0x14200 bored crypto
42426 256415 0 0 3 0x14200 usbtsk usbtask
93641 233033 0 0 3 0x14200 usbatsk usbatsk
77876 353886 0 0 3 0x14200 bored drmtskl
55832 245118 0 0 3 0x14200 bored drmlwq
79725 176992 0 0 3 0x14200 bored drmlwq
86580 323296 0 0 3 0x14200 bored drmlwq
12480 7554 0 0 3 0x14200 bored drmlwq
72464 509745 0 0 3 0x14200 bored drmubwq
20859 82297 0 0 3 0x14200 bored drmubwq
43431 60053 0 0 3 0x14200 bored drmubwq
69904 167702 0 0 3 0x14200 bored drmubwq
17767 438272 0 0 3 0x14200 bored drmhpwq
16087 458102 0 0 3 0x14200 bored drmhpwq
64274 437063 0 0 3 0x14200 bored drmhpwq
63524 468676 0 0 3 0x14200 bored drmhpwq
10231 61781 0 0 3 0x14200 bored drmwq
74996 96653 0 0 3 0x14200 bored drmwq
9420 373331 0 0 3 0x14200 bored drmwq
95965 442364 0 0 3 0x14200 bored drmwq
69608 20515 0 0 3 0x40014200 acpi0 acpi0
68858 32798 0 0 7 0x40014200 idle3
94100 274306 0 0 1 0x14200 idle2
30778 109612 0 0 3 0x40014200 idle1
45549 278157 0 0 3 0x14200 bored sensors
14889 481387 0 0 3 0x14200 bored softnet
51916 229545 0 0 3 0x14200 bored systqmp
39161 389488 0 0 3 0x14200 bored systq
98123 330029 0 0 3 0x40014200 bored softclock
18964 9085 0 0 7 0x40014200 idle0
1 492901 0 0 3 0 initexec swapper
* 0 0 -1 0 7 0x10200 swapper
ddb{1}> show uvm
Current UVM status:
pagesize=4096 (0x1000), pagemask=0xfff, pageshift=12
971039 VM pages: 0 active, 0 inactive, 20 wired, 963031 free (7664 zero)
min 10% (25) anon, 10% (25) vnode, 5% (12) vtext
freemin=32367, free-target=43156, inactive-target=0, wired-max=323679
faults=0, traps=3, intrs=44, ctxswitch=166 fpuswitch=0
softint=137, syscalls=0, kmapent=13
fault counts:
noram=0, noanon=0, noamap=0, pgwait=0, pgrele=0
ok relocks(total)=0(0), anget(retries)=0(0), amapcopy=0
neighbor anon/obj pg=0/0, gets(lock/unlock)=0/0
cases: anon=0, anoncow=0, obj=0, prcopy=0, przero=0
daemon and swap counts:
woke=0, revs=0, scans=0, obscans=0, anscans=0
busy=0, freed=0, reactivate=0, deactivate=0
pageouts=0, pending=0, nswget=0
nswapdev=1
swpages=1068660, swpginuse=0, swpgonly=0 paging=0
kernel pointers:
objs(kern)=0xffffffff82168f20
ddb{1}> show bcstats
Current Buffer Cache status:
numbufs 2 busymapped 0, delwri 0
kvaslots 5964 avail kva slots 5964
bufpages 5, dmapages 5, dirtypages 0
pendingreads 0, pendingwrites 0
highflips 0, highflops 0, dmaflips 0
ddb{1}>
Or the boot process stops like this:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
or like this:
...
scsibus2 at vscsi0: 256 targets
softraid0 at root
scsibus3 at softraid0: 256 targets
root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
panic: rw_enter: (null) locking aga
Btw, I frequently also get this "command error":
...
wsdisplay0 at inteldrm0 mux 1
pckbd_enable: command error
wsdisplay0: screen 0-5 added (std, vt100 emulation)
...
On 2021-06-01 16:15, Jonathan Gray wrote:
> On Mon, May 31, 2021 at 02:38:34PM +0200, Matthias Pressfreund wrote:
>> After applying the diffs you sent on 2021-05-17 and 2021-05-27, I
>> booted the new kernel which completed until the login prompt. On
>> the way I got this:
>>
>> ...
>> scsibus2 at vscsi0: 256 targets
>> softraid0 at root
>> scsibus3 at softraid0: 256 targets
>> root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
>>
>> i915_ggtt_pin called with NULL vma
>> WARNING !list_empty(&dev->mode_config.connector_list) failed at
>> /usr/src/sys/dev/pci/drm/drm_mode_config.c:487
>> [drm] *ERROR* connector DP-2 leaked!
>> drm : drm_WARN_ON(d->wake_count)drm : drm_WARN_ON(d->wake_count)Device
>> initialization failed (-22)
>> WARNING ({ typeof(vblank->enabled) __tmp = *(volatile
>> typeof(vblank->enabled) *)&(vblank->enabled); membar_datadep_consumer();
>> __tmp; }) && drm_core_check_feature(dev, DRIVER_MODESET) failed at
>> /usr/src/sys/dev/pci/drm/drm_vblank.c:440
>> Automatic boot in progress: starting file system checks.
>> /dev/sd0a (5e1040cb2dc494f4.a): file system is clean; not checking
>> ...
>>
>>
>> Then I rebooted a few times without problems. Then, this happened:
>>
>> ...
>> scsibus2 at vscsi0: 256 targets
>> softraid0 at root
>> scsibus3 at softraid0: 256 targets
>> root on sd0a (5e1040cb2dc494f4.a) swap on sd0b dump on sd0b
>> uvm_fault(0xffffffff8218aa20, 0xb9, 0, 1) -> e
>> kernel: page fault trap, code=0
>> Stopped at i915_ggtt_pin+0x31: movq 0xb8(%rdi),%r12
>> ddb{0}> trace
>> i915_ggtt_pin(1,10000,20) at i915_ggtt_pin+0x31
>> gen6_ppgtt_pin(ffff80000080f000) at gen6_ppgtt_pin+0x7c
>> __intel_context_do_pin(fffffd817adb6d80) at __intel_context_do_pin+0xca
>> intel_engines_init(ffff800000104c38) at intel_engines_init+0x4b5
>> intel_gt_init(ffff800000104c38) at intel_gt_init+0x130
>> i915_gem_init(ffff800000100000) at i915_gem_init+0xa3
>> i915_driver_probe(ffff800000100000,ffffffff8207c330) at
>> i915_driver_probe+0x7ed
>>
>> inteldrm_attachhook(ffff800000100000) at inteldrm_attachhook+0x43
>> config_process_deferred_mountroot() at config_process_deferred_mountroot+0x6b
>
> I believe something is wrongly writing over the struct with the vma.
> Can you drop the previous patches and try this?
>
> Index: sys/dev/pci/drm/i915/i915_vma.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/drm/i915/i915_vma.c,v
> retrieving revision 1.5
> diff -u -p -r1.5 i915_vma.c
> --- sys/dev/pci/drm/i915/i915_vma.c 11 Oct 2020 05:45:33 -0000 1.5
> +++ sys/dev/pci/drm/i915/i915_vma.c 1 Jun 2021 13:16:37 -0000
> @@ -484,7 +484,8 @@ void __iomem *i915_vma_pin_iomap(struct
> err = agp_map_subregion(dev_priv->agph, vma->node.start,
> vma->node.size, &vma->bsh);
> if (err) {
> - err = -err;
> + printf("%s agp_map_subregion failed\n", __func__);
> + err = -ENOMEM;
> goto err;
> }
> ptr = bus_space_vaddr(dev_priv->bst, vma->bsh);
> @@ -851,6 +852,13 @@ static int vma_get_pages(struct i915_vma
> if (mutex_lock_interruptible(&vma->pages_mutex))
> return -EINTR;
>
> + if (((vaddr_t)vma->obj) < VM_MIN_KERNEL_ADDRESS) {
> + printf("%s bad obj ptr %p\n", __func__, vma->obj);
> + vma->obj = NULL;
> + err = -EINVAL;
> + goto unlock;
> + }
> +
> if (!atomic_read(&vma->pages_count)) {
> if (vma->obj) {
> err = i915_gem_object_pin_pages(vma->obj);
> @@ -1029,8 +1037,14 @@ static void flush_idle_contexts(struct i
>
> int i915_ggtt_pin(struct i915_vma *vma, u32 align, unsigned int flags)
> {
> - struct i915_address_space *vm = vma->vm;
> + struct i915_address_space *vm;
> int err;
> +
> + if (((vaddr_t)vma) < VM_MIN_KERNEL_ADDRESS) {
> + printf("%s called with %p vma\n", __func__, vma);
> + return -EINVAL;
> + }
> + vm = vma->vm;
>
> GEM_BUG_ON(!i915_vma_is_ggtt(vma));
>
> Index: sys/dev/pci/drm/i915/gt/gen6_ppgtt.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/gen6_ppgtt.c,v
> retrieving revision 1.1
> diff -u -p -r1.1 gen6_ppgtt.c
> --- sys/dev/pci/drm/i915/gt/gen6_ppgtt.c 8 Jun 2020 04:48:13 -0000
> 1.1
> +++ sys/dev/pci/drm/i915/gt/gen6_ppgtt.c 1 Jun 2021 13:18:33 -0000
> @@ -354,8 +354,10 @@ static struct i915_vma *pd_vma_create(st
> GEM_BUG_ON(size > ggtt->vm.total);
>
> vma = i915_vma_alloc();
> - if (!vma)
> + if (!vma) {
> + printf("%s i915_vma_alloc() returned NULL\n", __func__);
> return ERR_PTR(-ENOMEM);
> + }
>
> i915_active_init(&vma->active, NULL, NULL);
>
> @@ -401,8 +403,13 @@ int gen6_ppgtt_pin(struct i915_ppgtt *ba
> * size. We allocate at the top of the GTT to avoid fragmentation.
> */
> err = 0;
> - if (!atomic_read(&ppgtt->pin_count))
> + if (!atomic_read(&ppgtt->pin_count)) {
> + printf("gen6 pad1 0 %x\n", ppgtt->pad1[0]);
> + printf("pad1 4095 %x\n", ppgtt->pad1[4095]);
> + printf("pad2 0 %x\n", ppgtt->pad2[0]);
> + printf("pad2 4095 %x\n", ppgtt->pad2[4095]);
> err = i915_ggtt_pin(ppgtt->vma, GEN6_PD_ALIGN, PIN_HIGH);
> + }
> if (!err)
> atomic_inc(&ppgtt->pin_count);
> mutex_unlock(&ppgtt->pin_mutex);
> Index: sys/dev/pci/drm/i915/gt/gen6_ppgtt.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/gen6_ppgtt.h,v
> retrieving revision 1.1
> diff -u -p -r1.1 gen6_ppgtt.h
> --- sys/dev/pci/drm/i915/gt/gen6_ppgtt.h 8 Jun 2020 04:48:13 -0000
> 1.1
> +++ sys/dev/pci/drm/i915/gt/gen6_ppgtt.h 1 Jun 2021 12:59:09 -0000
> @@ -12,7 +12,9 @@ struct gen6_ppgtt {
> struct i915_ppgtt base;
>
> struct rwlock flush;
> + unsigned int pad1[PAGE_SIZE];
> struct i915_vma *vma;
> + unsigned int pad2[PAGE_SIZE];
> gen6_pte_t __iomem *pd_addr;
>
> atomic_t pin_count;
> Index: sys/dev/pci/drm/i915/gt/intel_ggtt.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/intel_ggtt.c,v
> retrieving revision 1.1
> diff -u -p -r1.1 intel_ggtt.c
> --- sys/dev/pci/drm/i915/gt/intel_ggtt.c 8 Jun 2020 04:48:13 -0000
> 1.1
> +++ sys/dev/pci/drm/i915/gt/intel_ggtt.c 1 Jun 2021 10:44:43 -0000
> @@ -699,8 +699,11 @@ int i915_init_ggtt(struct drm_i915_priva
>
> if (INTEL_PPGTT(i915) == INTEL_PPGTT_ALIASING) {
> ret = init_aliasing_ppgtt(&i915->ggtt);
> - if (ret)
> + if (ret) {
> + printf("\n%s init_aliasing_ppgtt() returned %d\n",
> + __func__, ret);
> cleanup_init_ggtt(&i915->ggtt);
> + }
> }
>
> return 0;
>