On Fri, May 30, 2025 at 10:14:36AM +0900, YASUOKA Masahiko wrote: > On Fri, 30 May 2025 10:08:38 +1000 > Jonathan Gray <j...@jsg.id.au> wrote: > > On Fri, May 30, 2025 at 08:01:31AM +0900, YASUOKA Masahiko wrote: > >> > >> >Synopsis: inteldrm stop working after {hibernate,suspend}/resume > >> >Category: kernel > >> >Environment: > >> System : OpenBSD 7.7 > >> Details : OpenBSD 7.7-current (GENERIC.MP) #117: Thu May 29 > >> 21:18:15 JST 2025 > >> > >> yasuoka@xxx:/home/yasuoka/src/sys/arch/amd64/compile/GENERIC.MP > >> > >> Architecture: OpenBSD.amd64 > >> Machine : amd64 > >> >Description: > >> After hibernate and resume, X11 stops working. Keyboard and > >> mouse don't work, but Ctrl-Alt-F1 or Ctrl-Alt-Backspace works. > >> > >> errors in dmesg: > >> **** > >> drm:pid97650:__uc_init_hw *ERROR* [drm] *ERROR* GT0: GuC initialization > >> failed 0xfffffffffffffffae > >> drm:pid97650:intel_gt_init_hw *ERROR* [drm] *ERROR* GT0: Enabling uc > >> failed (-5) > >> drm:pid97650:intel_gt_resume *ERROR* [drm] *ERROR* GT0: Failed to > >> initialize GPU, declaring it wedged! > >> **** > >> > >> This happens because guc_wait_ucode() in i915/gt/uc/intel_guc_fw.c > >> fails. > >> > >> The function is to wait for the GuC to start up by calling the inline > >> function guc_load_done() and the function checks two registers. 
> >> > >> 97 static inline bool guc_load_done(struct intel_uncore *uncore, > >> u32 *status, bool *success) > >> 98 { > >> 99 u32 val = intel_uncore_read(uncore, GUC_STATUS); > >> 100 u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, val); > >> 101 u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, val); > >> 102 > >> 103 *status = val; > >> 104 switch (uk_val) { > >> 105 case INTEL_GUC_LOAD_STATUS_READY: > >> 106 *success = true; > >> 107 return true; > >> 108 > >> 109 case INTEL_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH: > >> 110 case INTEL_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH: > >> 111 case INTEL_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE: > >> > >> In my test, the function fails with the registers: > >> > >> ukernel = INTEL_GUC_LOAD_STATUS_INIT_DATA_INVALID(0x71) > >> bootrom = INTEL_BOOTROM_STATUS_JUMP_PASSED(0x76) > >> > >> When I was using 7.6, I didn't see this problem. > >> > >> >How-To-Repeat: > >> 1. hibernate or suspend > >> 2. resume > >> > >> the problem happens always (~10 times) > >> > >> After the workaround diff, it does not always happen (~3 times) > >> > >> >Fix: > >> Also, the diff attached at the end works around the problem. > >> > >> The diff partially backs out the change on Feb 7 and adds a printf(). > >> > >> I don't understand it logically, but if the printf() is removed, the > >> problem starts happening. > > > > Thank you for the report. > > > > Does this smaller diff still work around the problem? > > The smaller diff doesn't fix the problem. I tried 2 times.
Here is the other part of your initial diff. The non-printf parts are a revert of 'drm/i915/guc: Change wa and EU_PERF_CNTL registers to MCR type' (Linux commit 835e4d9bb3a13879031942ca6692d5a82ec00158). It would also be helpful if you could try raising the value of GUC_LOAD_RETRY_LIMIT in intel_guc_fw.c without other patches, to find a value that works. Index: sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c =================================================================== RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c,v diff -u -p -r1.9 intel_guc_ads.c --- sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c 7 Feb 2025 03:03:30 -0000 1.9 +++ sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c 30 May 2025 05:58:58 -0000 @@ -408,13 +408,8 @@ static int guc_mmio_regset_init(struct t CCS_MASK(engine->gt)) ret |= GUC_MMIO_REG_ADD(gt, regset, GEN12_RCU_MODE, true); - /* - * some of the WA registers are MCR registers. As it is safe to - * use MCR form for non-MCR registers, for code simplicity, all - * WA registers are added with MCR form. - */ for (i = 0, wa = wal->list; i < wal->count; i++, wa++) - ret |= GUC_MCR_REG_ADD(gt, regset, wa->mcr_reg, wa->masked_reg); + ret |= GUC_MMIO_REG_ADD(gt, regset, wa->reg, wa->masked_reg); /* Be extra paranoid and include all whitelist registers. 
*/ for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) @@ -430,13 +425,13 @@ static int guc_mmio_regset_init(struct t ret |= GUC_MMIO_REG_ADD(gt, regset, GEN9_LNCFCMOCS(i), false); if (GRAPHICS_VER(engine->i915) >= 12) { - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL0)), false); - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL1)), false); - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL2)), false); - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL3)), false); - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL4)), false); - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL5)), false); - ret |= GUC_MCR_REG_ADD(gt, regset, MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL6)), false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL0, false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL1, false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL2, false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL3, false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL4, false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL5, false); + ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL6, false); } return ret ? -1 : 0; Index: sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c =================================================================== RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c,v diff -u -p -r1.8 intel_guc_fw.c --- sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c 7 Feb 2025 03:03:30 -0000 1.8 +++ sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c 30 May 2025 05:59:52 -0000 @@ -197,6 +197,7 @@ static int guc_wait_ucode(struct intel_g REG_FIELD_GET(GS_BOOTROM_MASK, status), REG_FIELD_GET(GS_UKERNEL_MASK, status)); } + printf("%s: count = %d, ret = %d\n", __func__, count, ret); after = ktime_get(); delta = ktime_sub(after, before); delta_ms = ktime_to_ms(delta);