tyr: add GPU reset handling

Onur Özkan Sun, 15 Mar 2026 03:44:56 -0700

On Fri, 13 Mar 2026 11:56:58 -0300
Daniel Almeida <[email protected]> wrote:


> 
> 
> > On 13 Mar 2026, at 06:16, Onur Özkan <[email protected]> wrote:
> > 
> > Move Tyr reset logic into a new reset module and add async reset
> > work.
> > 
> > This adds:
> > - ResetHandle with internal controller state
> > - a dedicated ordered reset workqueue
> > - a pending flag to avoid duplicate queued resets
> > - run_reset() as the shared synchronous reset helper
> > 
> > Probe now calls reset::run_reset() before normal init. Driver data
> > now keeps ResetHandle so reset work is drained before clocks and
> > regulators are dropped.
> > 
> > Tested-by: Deborah Brouwer <[email protected]>
> > Signed-off-by: Onur Özkan <[email protected]>
> > ---
> > drivers/gpu/drm/tyr/driver.rs |  40 +++-----
> > drivers/gpu/drm/tyr/reset.rs  | 180
> > ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/tyr/tyr.rs    |
> >  1 + 3 files changed, 192 insertions(+), 29 deletions(-)
> > create mode 100644 drivers/gpu/drm/tyr/reset.rs
> > 
> > diff --git a/drivers/gpu/drm/tyr/driver.rs
> > b/drivers/gpu/drm/tyr/driver.rs index f7951804e4e0..c80238a21ff2
> > 100644 --- a/drivers/gpu/drm/tyr/driver.rs
> > +++ b/drivers/gpu/drm/tyr/driver.rs
> > @@ -6,11 +6,8 @@
> >         OptionalClk, //
> >     },
> >     device::{
> > -        Bound,
> > -        Core,
> > -        Device, //
> > +        Core, //
> >     },
> > -    devres::Devres,
> >     dma::{
> >         Device as DmaDevice,
> >         DmaMask, //
> > @@ -22,10 +19,7 @@
> >         Registered,
> >         UnregisteredDevice, //
> >     },
> > -    io::poll,
> > -    new_mutex,
> > -    of,
> > -    platform,
> > +    new_mutex, of, platform,
> >     prelude::*,
> >     regulator,
> >     regulator::Regulator,
> > @@ -35,17 +29,15 @@
> >         Arc,
> >         Mutex, //
> >     },
> > -    time, //
> > };
> > 
> > use crate::{
> >     file::TyrDrmFileData,
> >     fw::Firmware,
> >     gem::BoData,
> > -    gpu,
> >     gpu::GpuInfo,
> >     mmu::Mmu,
> > -    regs, //
> > +    reset, //
> > };
> > 
> > pub(crate) type IoMem = kernel::io::mem::IoMem<SZ_2M>;
> > @@ -62,6 +54,11 @@ pub(crate) struct TyrPlatformDriverData {
> > 
> > #[pin_data]
> > pub(crate) struct TyrDrmDeviceData {
> > +    // `ResetHandle::drop()` drains queued/running works and this
> > must happen
> > +    // before clocks/regulators are dropped. So keep this field
> > before them to
> > +    // ensure the correct drop order.
> > +    pub(crate) reset: reset::ResetHandle,
> > +
> >     pub(crate) pdev: ARef<platform::Device>,
> > 
> >     pub(crate) fw: Arc<Firmware>,
> > @@ -90,22 +87,6 @@ unsafe impl Send for TyrDrmDeviceData {}
> > // SAFETY: This will be removed in a future patch.
> > unsafe impl Sync for TyrDrmDeviceData {}
> > 
> > -fn issue_soft_reset(dev: &Device<Bound>, iomem: &Devres<IoMem>) ->
> > Result {
> > -    // Clear any stale reset-complete IRQ state before issuing a
> > new soft reset.
> > -    regs::GPU_IRQ_CLEAR.write(dev, iomem,
> > regs::GPU_IRQ_RAWSTAT_RESET_COMPLETED)?;
> > -    regs::GPU_CMD.write(dev, iomem, regs::GPU_CMD_SOFT_RESET)?;
> > -
> > -    poll::read_poll_timeout(
> > -        || regs::GPU_IRQ_RAWSTAT.read(dev, iomem),
> > -        |status| *status & regs::GPU_IRQ_RAWSTAT_RESET_COMPLETED
> > != 0,
> > -        time::Delta::from_millis(1),
> > -        time::Delta::from_millis(100),
> > -    )
> > -    .inspect_err(|_| dev_err!(dev, "GPU reset failed."))?;
> > -
> > -    Ok(())
> > -}
> > -
> > kernel::of_device_table!(
> >     OF_TABLE,
> >     MODULE_OF_TABLE,
> > @@ -138,8 +119,7 @@ fn probe(
> >         let request = pdev.io_request_by_index(0).ok_or(ENODEV)?;
> >         let iomem = Arc::pin_init(request.iomap_sized::<SZ_2M>(),
> > GFP_KERNEL)?;
> > 
> > -        issue_soft_reset(pdev.as_ref(), &iomem)?;
> > -        gpu::l2_power_on(pdev.as_ref(), &iomem)?;
> > +        reset::run_reset(pdev.as_ref(), &iomem)?;
> > 
> >         let gpu_info = GpuInfo::new(pdev.as_ref(), &iomem)?;
> >         gpu_info.log(pdev);
> > @@ -153,6 +133,7 @@ fn probe(
> > 
> >         let uninit_ddev =
> > UnregisteredDevice::<TyrDrmDriver>::new(pdev.as_ref())?; let
> > platform: ARef<platform::Device> = pdev.into();
> > +        let reset = reset::ResetHandle::new(platform.clone(),
> > iomem.clone())?;
> > 
> >         let mmu = Mmu::new(pdev, iomem.as_arc_borrow(), &gpu_info)?;
> > 
> > @@ -178,6 +159,7 @@ fn probe(
> >                     _mali: mali_regulator,
> >                     _sram: sram_regulator,
> >                 }),
> > +                reset,
> >                 gpu_info,
> >         });
> >         let ddev = Registration::new_foreign_owned(uninit_ddev,
> > pdev.as_ref(), data, 0)?; diff --git a/drivers/gpu/drm/tyr/reset.rs
> > b/drivers/gpu/drm/tyr/reset.rs new file mode 100644
> > index 000000000000..29dfae98b0dd
> > --- /dev/null
> > +++ b/drivers/gpu/drm/tyr/reset.rs
> > @@ -0,0 +1,180 @@
> > +// SPDX-License-Identifier: GPL-2.0 or MIT
> > +
> > +//! Provides asynchronous reset handling for the Tyr DRM driver via
> > +//! [`ResetHandle`], which runs reset work on a dedicated ordered
> > +//! workqueue and avoids duplicate pending resets.
> > +
> > +use kernel::{
> > +    device::{
> > +        Bound,
> > +        Device, //
> > +    },
> > +    devres::Devres,
> > +    io::poll,
> > +    platform,
> > +    prelude::*,
> > +    sync::{
> > +        aref::ARef,
> > +        atomic::{
> > +            Acquire,
> > +            Atomic,
> > +            Relaxed,
> > +            Release, //
> > +        },
> > +        Arc,
> > +    },
> > +    time,
> > +    workqueue::{
> > +        self,
> > +        Work, //
> > +    },
> > +};
> > +
> > +use crate::{
> > +    driver::IoMem,
> > +    gpu,
> > +    regs, //
> > +};
> > +
> > +/// Manages asynchronous GPU reset handling and ensures only a
> > single reset +/// work is pending at a time.
> > +#[pin_data]
> > +struct Controller {
> > +    /// Platform device reference needed for reset operations and
> > logging.
> > +    pdev: ARef<platform::Device>,
> > +    /// Mapped register space needed for reset operations.
> > +    iomem: Arc<Devres<IoMem>>,
> > +    /// Atomic flag for controlling the scheduling pending state.
> > +    pending: Atomic<bool>,
> > +    /// Dedicated ordered workqueue for reset operations.
> > +    wq: workqueue::OrderedQueue,
> > +    /// Work item backing async reset processing.
> > +    #[pin]
> > +    work: Work<Controller>,
> > +}
> > +
> > +kernel::impl_has_work! {
> > +    impl HasWork<Controller> for Controller { self.work }
> > +}
> > +
> > +impl workqueue::WorkItem for Controller {
> > +    type Pointer = Arc<Self>;
> > +
> > +    fn run(this: Arc<Self>) {
> > +        this.reset_work();
> > +    }
> > +}
> > +
> > +impl Controller {
> > +    /// Creates a [`Controller`] instance.
> > +    fn new(pdev: ARef<platform::Device>, iomem:
> > Arc<Devres<IoMem>>) -> Result<Arc<Self>> {
> > +        let wq = workqueue::OrderedQueue::new(c"tyr-reset-wq", 0)?;
> > +
> > +        Arc::pin_init(
> > +            try_pin_init!(Self {
> > +                pdev,
> > +                iomem,
> > +                pending: Atomic::new(false),
> > +                wq,
> > +                work <- kernel::new_work!("tyr::reset"),
> > +            }),
> > +            GFP_KERNEL,
> > +        )
> > +    }
> > +
> > +    /// Processes one scheduled reset request.
> > +    ///
> > +    /// Panthor reference:
> > +    /// -
> > drivers/gpu/drm/panthor/panthor_device.c::panthor_device_reset_work()
> > +    fn reset_work(self: &Arc<Self>) {
> > +        dev_info!(self.pdev.as_ref(), "GPU reset work is
> > started.\n"); +
> > +        // SAFETY: `Controller` is part of driver-private data and
> > only exists
> > +        // while the platform device is bound.
> > +        let pdev = unsafe { self.pdev.as_ref().as_bound() };
> > +        if let Err(e) = run_reset(pdev, &self.iomem) {
> > +            dev_err!(self.pdev.as_ref(), "GPU reset failed:
> > {:?}\n", e);
> > +        } else {
> > +            dev_info!(self.pdev.as_ref(), "GPU reset work is
> > done.\n");
> > +        }
> 
> Can we have more descriptive strings here? A user cares little for
> implementation details such as “reset work”, what they care about is
> that the hardware is undergoing a reset.
> 

Sure, I will update it.

> > +
> > +        self.pending.store(false, Release);
> > +    }
> > +}
> > +
> > +/// Reset handle that shuts down pending work gracefully on drop.
> > +pub(crate) struct ResetHandle(Arc<Controller>);
> > +
> 
> Why is this an Arc? There seems to be a single owner?
> 

Once we queue reset work, the workqueue needs its own ref so Controller
stays alive until the worker runs and returns. ResetHandle keeps the
normal ref and the queued work holds the extra one.

Regards,
Onur

> > +impl ResetHandle {
> > +    /// Creates a [`ResetHandle`] instance.
> > +    pub(crate) fn new(pdev: ARef<platform::Device>, iomem:
> > Arc<Devres<IoMem>>) -> Result<Self> {
> > +        Ok(Self(Controller::new(pdev, iomem)?))
> > +    }
> > +
> > +    /// Schedules reset work.
> > +    #[expect(dead_code)]
> > +    pub(crate) fn schedule(&self) {
> > +        // TODO: Similar to `panthor_device_schedule_reset()` in
> > Panthor, add a
> > +        // power management check once Tyr supports it.
> > +
> > +        // Keep only one reset request running or queued. If one
> > is already pending,
> > +        // we ignore new schedule requests.
> > +        if self.0.pending.cmpxchg(false, true, Relaxed).is_ok()
> > +            && self.0.wq.enqueue(self.0.clone()).is_err()
> > +        {
> > +            self.0.pending.store(false, Release);
> > +        }
> > +    }
> > +
> > +    /// Returns true if a reset is queued or in progress.
> > +    ///
> > +    /// Note that the state can change immediately after the
> > return.
> > +    #[inline]
> > +    #[expect(dead_code)]
> > +    pub(crate) fn is_pending(&self) -> bool {
> > +        self.0.pending.load(Acquire)
> > +    }
> 
> > +}
> > +
> > +impl Drop for ResetHandle {
> > +    fn drop(&mut self) {
> > +        // Drain queued/running work and block future queueing
> > attempts for this
> > +        // work item before clocks/regulators are torn down.
> > +        // SAFETY: drop executes in a sleepable context.
> > +        unsafe { self.0.work.disable_sync() };
> > +    }
> > +}
> > +
> > +/// Issues a soft reset command and waits for reset-complete IRQ
> > status. +fn issue_soft_reset(dev: &Device<Bound>, iomem:
> > &Devres<IoMem>) -> Result {
> > +    // Clear any stale reset-complete IRQ state before issuing a
> > new soft reset.
> > +    regs::GPU_IRQ_CLEAR.write(dev, iomem,
> > regs::GPU_IRQ_RAWSTAT_RESET_COMPLETED)?;
> > +    regs::GPU_CMD.write(dev, iomem, regs::GPU_CMD_SOFT_RESET)?;
> > +
> > +    poll::read_poll_timeout(
> > +        || regs::GPU_IRQ_RAWSTAT.read(dev, iomem),
> > +        |status| *status & regs::GPU_IRQ_RAWSTAT_RESET_COMPLETED
> > != 0,
> > +        time::Delta::from_millis(1),
> > +        time::Delta::from_millis(100),
> > +    )
> > +    .inspect_err(|_| dev_err!(dev, "GPU reset failed."))?;
> > +
> > +    Ok(())
> > +}
> > +
> > +/// Runs one synchronous GPU reset pass.
> > +///
> > +/// Its visibility is `pub(super)` only so the probe path can run
> > an +/// initial reset; it is not part of this module's public API.
> > +///
> > +/// On success, the GPU is left in a state suitable for
> > reinitialization. +///
> > +/// The reset sequence is as follows:
> > +/// 1. Trigger a GPU soft reset.
> > +/// 2. Wait for the reset-complete IRQ status.
> > +/// 3. Power L2 back on.
> > +pub(super) fn run_reset(dev: &Device<Bound>, iomem:
> > &Devres<IoMem>) -> Result {
> > +    issue_soft_reset(dev, iomem)?;
> > +    gpu::l2_power_on(dev, iomem)?;
> > +    Ok(())
> > +}
> > diff --git a/drivers/gpu/drm/tyr/tyr.rs b/drivers/gpu/drm/tyr/tyr.rs
> > index 18b0668bb217..d0349bc49f27 100644
> > --- a/drivers/gpu/drm/tyr/tyr.rs
> > +++ b/drivers/gpu/drm/tyr/tyr.rs
> > @@ -14,6 +14,7 @@
> > mod gpu;
> > mod mmu;
> > mod regs;
> > +mod reset;
> > mod slot;
> > mod vm;
> > 
> > -- 
> > 2.51.2
> > 
> 
>

Re: [PATCH v1 RESEND 4/4] drm/tyr: add GPU reset handling

Reply via email to