Hi Alex, Just did a quick pass, will do a proper review later:
> On Dec 16, 2025, at 12:13 AM, Alexandre Courbot <[email protected]> wrote: > > Currently, the GSP is left running after the driver is unbound. This is > not great for several reasons, notably that it can still access shared > memory areas that the kernel will now reclaim (especially problematic on > setups without an IOMMU). > > Fix this by sending the `UNLOADING_GUEST_DRIVER` GSP command when > unbinding. This stops the GSP and let us proceed with the rest of the > unbind sequence in the next patch. > > Signed-off-by: Alexandre Courbot <[email protected]> > --- > drivers/gpu/nova-core/gpu.rs | 5 +++ > drivers/gpu/nova-core/gsp/boot.rs | 42 +++++++++++++++++++++++ > drivers/gpu/nova-core/gsp/commands.rs | 42 +++++++++++++++++++++++ > drivers/gpu/nova-core/gsp/fw.rs | 4 +++ > drivers/gpu/nova-core/gsp/fw/commands.rs | 27 +++++++++++++++ > drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs | 8 +++++ > 6 files changed, 128 insertions(+) > > diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs > index ef6ceced350e..b94784f57b36 100644 > --- a/drivers/gpu/nova-core/gpu.rs > +++ b/drivers/gpu/nova-core/gpu.rs > @@ -291,6 +291,9 @@ pub(crate) fn new<'a>( > > /// Called when the corresponding [`Device`](device::Device) is unbound. > /// > + /// Prepares the GPU for unbinding by shutting down the GSP and > unregistering the sysmem flush > + /// memory page. > + /// > /// Note: This method must only be called from `Driver::unbind`. > pub(crate) fn unbind(self: Pin<&mut Self>, dev: > &device::Device<device::Core>) { > let this = self.project(); > @@ -299,6 +302,8 @@ pub(crate) fn unbind(self: Pin<&mut Self>, dev: > &device::Device<device::Core>) { > return; > }; > > + let _ = kernel::warn_on_err!(this.gsp.unload(dev, bar, > this.gsp_falcon,)); > + > this.sysmem_flush.unregister(bar); > } > } > diff --git a/drivers/gpu/nova-core/gsp/boot.rs > b/drivers/gpu/nova-core/gsp/boot.rs > index ca7efdaa753b..e12e1d3fd53f 100644 > --- a/drivers/gpu/nova-core/gsp/boot.rs > +++ b/drivers/gpu/nova-core/gsp/boot.rs > @@ -32,6 +32,7 @@ > }, > gpu::Chipset, > gsp::{ > + cmdq::Cmdq, > commands, > sequencer::{ > GspSequencer, > @@ -231,4 +232,45 @@ pub(crate) fn boot( > > Ok(()) > } > + > + /// Shutdowns the GSP and wait until it is offline. > + fn shutdown_gsp( > + cmdq: &mut Cmdq, > + bar: &Bar0, > + gsp_falcon: &Falcon<Gsp>, > + suspend: bool, > + ) -> Result<()> { > + // Send command to shutdown GSP and wait for response. > + commands::unload_guest_driver(cmdq, bar, suspend)?; > + > + // Wait until GSP signals it is suspended. > + const LIBOS_INTERRUPT_PROCESSOR_SUSPENDED: u32 = 0x8000_0000; > + read_poll_timeout( > + || Ok(gsp_falcon.read_mailbox0(bar)), > + |&mb0| mb0 == LIBOS_INTERRUPT_PROCESSOR_SUSPENDED, > + Delta::ZERO, > + Delta::from_secs(5), > + ) > + .map(|_| ()) > + } > + > + /// Attempts to unload the GSP firmware. > + /// > + /// This stops all activity on the GSP. > + pub(crate) fn unload( > + self: Pin<&mut Self>, > + dev: &device::Device<device::Bound>, > + bar: &Bar0, > + gsp_falcon: &Falcon<Gsp>, > + ) -> Result { > + let this = self.project(); > + > + /* Shutdown the GSP */ > + > + Self::shutdown_gsp(this.cmdq, bar, gsp_falcon, false) > + .inspect_err(|e| dev_err!(dev, "unload guest driver failed: > {:?}", e))?; > + dev_dbg!(dev, "GSP shut down\n"); > + > + Ok(()) > + } > } > diff --git a/drivers/gpu/nova-core/gsp/commands.rs > b/drivers/gpu/nova-core/gsp/commands.rs > index 2050771f9b53..0bcfca8c7e42 100644 > --- a/drivers/gpu/nova-core/gsp/commands.rs > +++ b/drivers/gpu/nova-core/gsp/commands.rs > @@ -225,3 +225,45 @@ pub(crate) fn get_gsp_info(cmdq: &mut Cmdq, bar: &Bar0) > -> Result<GetGspStaticIn > } > } > } > + > +impl CommandToGsp for UnloadingGuestDriver { > + const FUNCTION: MsgFunction = MsgFunction::UnloadingGuestDriver; > + type Command = Self; > + type InitError = Infallible; > + > + fn init(&self) -> impl Init<Self::Command, Self::InitError> { > + *self > + } > +} > + > +pub(crate) struct UnloadingGuestDriverReply; > + > +impl MessageFromGsp for UnloadingGuestDriverReply { > + const FUNCTION: MsgFunction = MsgFunction::UnloadingGuestDriver; > + type InitError = Infallible; > + type Message = (); > + > + fn read( > + _msg: &Self::Message, > + _sbuffer: &mut SBufferIter<array::IntoIter<&[u8], 2>>, > + ) -> Result<Self, Self::InitError> { > + Ok(UnloadingGuestDriverReply) > + } > +} > + > +/// Send the [`UnloadingGuestDriver`] command to the GSP and await the reply. > +pub(crate) fn unload_guest_driver( > + cmdq: &mut Cmdq, > + bar: &Bar0, > + suspend: bool, > +) -> Result<UnloadingGuestDriverReply> { > + cmdq.send_command(bar, UnloadingGuestDriver::new(suspend))?; > + > + loop { > + match > cmdq.receive_msg::<UnloadingGuestDriverReply>(Delta::from_secs(5)) { > + Ok(resp) => return Ok(resp), > + Err(ERANGE) => continue, > + Err(e) => return Err(e), > + } This outer loop can go on infinitely, lets bound it? Either outer timeout or bounded number of tries. Bounded tries better since it has inner timeout. > + } > +} > diff --git a/drivers/gpu/nova-core/gsp/fw.rs b/drivers/gpu/nova-core/gsp/fw.rs > index 09549f7db52d..228464fd47a0 100644 > --- a/drivers/gpu/nova-core/gsp/fw.rs > +++ b/drivers/gpu/nova-core/gsp/fw.rs > @@ -209,6 +209,7 @@ pub(crate) enum MsgFunction { > GspInitPostObjGpu = bindings::NV_VGPU_MSG_FUNCTION_GSP_INIT_POST_OBJGPU, > GspRmControl = bindings::NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL, > GetStaticInfo = bindings::NV_VGPU_MSG_FUNCTION_GET_STATIC_INFO, > + UnloadingGuestDriver = > bindings::NV_VGPU_MSG_FUNCTION_UNLOADING_GUEST_DRIVER, > > // Event codes > GspInitDone = bindings::NV_VGPU_MSG_EVENT_GSP_INIT_DONE, > @@ -249,6 +250,9 @@ fn try_from(value: u32) -> Result<MsgFunction> { > } > bindings::NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL => > Ok(MsgFunction::GspRmControl), > bindings::NV_VGPU_MSG_FUNCTION_GET_STATIC_INFO => > Ok(MsgFunction::GetStaticInfo), > + bindings::NV_VGPU_MSG_FUNCTION_UNLOADING_GUEST_DRIVER => { > + Ok(MsgFunction::UnloadingGuestDriver) > + } > bindings::NV_VGPU_MSG_EVENT_GSP_INIT_DONE => > Ok(MsgFunction::GspInitDone), > bindings::NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER => { > Ok(MsgFunction::GspRunCpuSequencer) > diff --git a/drivers/gpu/nova-core/gsp/fw/commands.rs > b/drivers/gpu/nova-core/gsp/fw/commands.rs > index 85465521de32..c7df4783ad21 100644 > --- a/drivers/gpu/nova-core/gsp/fw/commands.rs > +++ b/drivers/gpu/nova-core/gsp/fw/commands.rs > @@ -125,3 +125,30 @@ unsafe impl AsBytes for GspStaticConfigInfo {} > // SAFETY: This struct only contains integer types for which all bit patterns > // are valid. > unsafe impl FromBytes for GspStaticConfigInfo {} > + > +/// Payload of the `UnloadingGuestDriver` command and message. > +#[repr(transparent)] > +#[derive(Clone, Copy, Debug, Zeroable)] > +pub(crate) struct UnloadingGuestDriver { > + inner: bindings::rpc_unloading_guest_driver_v1F_07, > +} > + > +impl UnloadingGuestDriver { > + pub(crate) fn new(suspend: bool) -> Self { > + Self { > + inner: bindings::rpc_unloading_guest_driver_v1F_07 { We should go through intermediate firmware representation than direct bindings access? > + bInPMTransition: if suspend { 1 } else { 0 }, Then this can just be passed as a bool. > + bGc6Entering: 0, > + newLevel: if suspend { 3 } else { 0 }, > + ..Zeroable::zeroed() > + }, > + } > + } > +} > + > +// SAFETY: Padding is explicit and will not contain uninitialized data. > +unsafe impl AsBytes for UnloadingGuestDriver {} > + > +// SAFETY: This struct only contains integer types for which all bit patterns > +// are valid. > +unsafe impl FromBytes for UnloadingGuestDriver {} > diff --git a/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs > b/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs > index 6d25fe0bffa9..212ccc13c0cc 100644 > --- a/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs > +++ b/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs > @@ -879,6 +879,14 @@ fn default() -> Self { > } > } > #[repr(C)] > +#[derive(Debug, Default, Copy, Clone, MaybeZeroable)] > +pub struct rpc_unloading_guest_driver_v1F_07 { > + pub bInPMTransition: u8_, > + pub bGc6Entering: u8_, > + pub __bindgen_padding_0: [u8; 2usize], > + pub newLevel: u32_, When these are intermediate represented, documentation would be nice on the fields. thanks, - Joel > +} > +#[repr(C)] > #[derive(Debug, Default, MaybeZeroable)] > pub struct rpc_run_cpu_sequencer_v17_00 { > pub bufferSizeDWord: u32_, > > -- > 2.52.0 >
