On Sat Nov 15, 2025 at 8:30 AM JST, Timur Tabi wrote:
> Turing and GA100 use programmed I/O (PIO) instead of DMA to upload
> firmware images into Falcon memory.
>
> A new firmware called the Generic Bootloader (as opposed to the
> GSP Bootloader) is used to upload FWSEC.
>
> Signed-off-by: Timur Tabi <[email protected]>
> ---
> drivers/gpu/nova-core/falcon.rs | 181 ++++++++++++++++++++++++
> drivers/gpu/nova-core/firmware.rs | 4 +-
> drivers/gpu/nova-core/firmware/fwsec.rs | 112 ++++++++++++++-
> drivers/gpu/nova-core/gsp/boot.rs | 10 +-
> 4 files changed, 299 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/nova-core/falcon.rs b/drivers/gpu/nova-core/falcon.rs
> index 7af32f65ba5f..f9a4a35b7569 100644
> --- a/drivers/gpu/nova-core/falcon.rs
> +++ b/drivers/gpu/nova-core/falcon.rs
> @@ -20,6 +20,10 @@
> use crate::{
> dma::DmaObject,
> driver::Bar0,
> + firmware::fwsec::{
> + BootloaderDmemDescV2,
> + GenericBootloader, //
> + },
> gpu::Chipset,
> num::{
> FromSafeCast,
> @@ -400,6 +404,183 @@ pub(crate) fn reset(&self, bar: &Bar0) -> Result {
> Ok(())
> }
>
> +
> + /// See nvkm_falcon_pio_wr - takes a byte array instead of a
> FalconFirmware
> + fn pio_wr_bytes(
> + &self,
> + bar: &Bar0,
> + source: *const u8,
> + mem_base: u16,
> + length: usize,
We will definitely want to combine `source` and `length` into a single
`&[u8]`. I understand why you used a raw pointer here: we need to write
an instance of `BootloaderDmemDescV2`, and we also need to write data
that lives in a `CoherentAllocation`.
The first case is easy to fix: `BootloaderDmemDescV2` is just a bunch of
integers, so you can implement `AsBytes` on it and get a slice of bytes,
which is exactly what we want. The second case is covered further down,
in the comments on `pio_wr`.
> + target_mem: FalconMem,
> + port: u8,
> + tag: u16
> + ) -> Result {
> + // To avoid unnecessary complication in the write loop, make sure
> the buffer
> + // length is aligned. It always is, which is why an assertion is
> okay.
> + assert!((length % 4) == 0);
Let's return an error instead of panicking here.
> +
> + // From now on, we treat the data as an array of u32
> +
> + let length = length / 4;
> + let mut remaining_len: usize = length;
> + let mut img_offset: usize = 0;
> + let mut tag = tag;
> +
> + // Get data as a slice of u32s
> + let img = unsafe {
> + core::slice::from_raw_parts(source as *const u32, length)
> + };
> +
> + match target_mem {
> + FalconMem::ImemSec | FalconMem::ImemNs => {
> + regs::NV_PFALCON_FALCON_IMEMC::default()
> + .set_secure(target_mem == FalconMem::ImemSec)
> + .set_aincw(true)
> + .set_offs(mem_base)
> + .write(bar, &E::ID, port as usize);
> + },
> + FalconMem::Dmem => {
> + // gm200_flcn_pio_dmem_wr_init
Probably a stray development-time comment.
> + regs::NV_PFALCON_FALCON_DMEMC::default()
> + .set_aincw(true)
> + .set_offs(mem_base)
> + .write(bar, &E::ID, port as usize);
> + },
> + }
> +
> + while remaining_len > 0 {
> + let xfer_len = core::cmp::min(remaining_len, 256 / 4); //
> pio->max = 256
> +
> + // Perform the PIO write for the next 256 bytes. Each tag
> represents
> + // a 256-byte block in IMEM/DMEM.
> + let mut len = xfer_len;
> +
> + match target_mem {
> + FalconMem::ImemSec | FalconMem::ImemNs => {
> + regs::NV_PFALCON_FALCON_IMEMT::default()
> + .set_tag(tag)
> + .write(bar, &E::ID, port as usize);
> +
> + while len > 0 {
> + regs::NV_PFALCON_FALCON_IMEMD::default()
> + .set_data(img[img_offset])
> + .write(bar, &E::ID, port as usize);
> + img_offset += 1;
> + len -= 1;
> + };
> +
> + tag += 1;
> + },
> + FalconMem::Dmem => {
> + // tag is ignored for DMEM
> + while len > 0 {
> + regs::NV_PFALCON_FALCON_DMEMD::default()
> + .set_data(img[img_offset])
> + .write(bar, &E::ID, port as usize);
> + img_offset += 1;
> + len -= 1;
> + };
> + },
> + }
> +
> + remaining_len -= xfer_len;
> + }
Let's turn this C-style loop into something more idiomatic.
We want to divide the input twice: first into 256-byte blocks, so we can
write the IMEM tag where needed, and then each block into `u32`s. Since
Nova is little-endian, we can assume that byte ordering. This lets us
leverage `chunks` and `from_bytes`.
I got the following (untested) code, which assumes `source` is the
`&[u8]` we want to write:
// Length of an IMEM tag in bytes.
const IMEM_TAG_LEN: usize = 256;
for chunk in source.chunks(IMEM_TAG_LEN) {
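// Convert our chunk of bytes into an array of u32s.
//
// This cannot fail as long as the chunk is a full `IMEM_TAG_LEN` bytes
// (a shorter trailing chunk would be rejected, so `source.len()` must be
// a multiple of `IMEM_TAG_LEN`), but propagate the error to avoid an
// `unsafe` statement.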
let chunk = <[u32; IMEM_TAG_LEN /
size_of::<u32>()]>::from_bytes(chunk)?;
match target_mem {
FalconMem::Imem { .. } => {
regs::NV_PFALCON_FALCON_IMEMT::default().set_tag(tag).write(
bar,
&E::ID,
port as usize,
);
tag += 1;
for &data in chunk {
regs::NV_PFALCON_FALCON_IMEMD::default()
.set_data(data)
.write(bar, &E::ID, port as usize);
}
}
FalconMem::Dmem => {
for &data in chunk {
regs::NV_PFALCON_FALCON_DMEMD::default()
.set_data(data)
.write(bar, &E::ID, port as usize);
}
}
}
}
The cool thing is that this removes all the mutable variables and
counters, with the exception of `tag`. It is also shorter and more
explicit about the intent IMHO.
> +
> + Ok(())
> + }
> +
> + /// See nvkm_falcon_pio_wr
This doc comment isn't really helpful - why is this method needed at all?
It appears to exist because we pass the firmware data as a
`CoherentAllocation`, which PIO loading cannot consume directly: PIO
writes the data word by word through registers instead of using DMA.
But `pio_wr` is only ever called from `pio_load`, so `pio_load` could
call the `as_slice` method of `CoherentAllocation` to obtain a slice and
work with `pio_wr_bytes` directly, removing the need for this method.
> + fn pio_wr<F: FalconFirmware<Target = E>>(
> + &self,
> + bar: &Bar0,
> + fw: &F,
> + target_mem: FalconMem,
> + load_offsets: &FalconLoadTarget,
> + port: u8,
> + tag: u16,
> + ) -> Result {
> + // FIXME: There's probably a better way to create a pointer to
> inside the firmware
> + // Maybe CoherentAllocation needs to implement a method for that.
> + let start = unsafe { fw.start_ptr().add(load_offsets.src_start as
> usize) };
Yes, `as_slice` will give you a slice that you can pass directly to the
updated `pio_wr_bytes`:
let fw_bytes = unsafe { fw.as_slice(0, fw.size())? };
> + self.pio_wr_bytes(bar, start,
> + load_offsets.dst_start as u16,
> + load_offsets.len as usize, target_mem, port, tag)
> + }
> +
> + /// Perform a PIO copy into `IMEM` and `DMEM` of `fw`, and prepare the
> falcon to run it.
> + pub(crate) fn pio_load<F: FalconFirmware<Target = E>>(
> + &self,
> + bar: &Bar0,
> + fw: &F,
> + gbl: Option<&GenericBootloader>
> + ) -> Result {
> + let imem_sec = fw.imem_sec_load_params();
> + let imem_ns = fw.imem_ns_load_params().unwrap();
Let's return an error in this case instead of panicking.
> + let dmem = fw.dmem_load_params();
> +
> + regs::NV_PFALCON_FBIF_CTL::read(bar, &E::ID)
> + .set_allow_phys_no_ctx(true)
> + .write(bar, &E::ID);
> +
> + regs::NV_PFALCON_FALCON_DMACTL::default()
> + .write(bar, &E::ID);
> +
> + // If the Generic Bootloader was passed, then use it to boot FRTS
> + if let Some(gbl) = gbl {
> + let load_params = FalconLoadTarget {
> + src_start: 0,
> + dst_start: 0x10000 - gbl.desc.code_size,
> + len: gbl.desc.code_size,
> + };
> + self.pio_wr_bytes(bar, gbl.ucode.as_ptr(),
> + load_params.dst_start as u16, load_params.len as usize,
> + FalconMem::ImemNs, 0, gbl.desc.start_tag as u16)?;
> +
> + // This structure tells the generic bootloader where to find the
> FWSEC
> + // image.
> + let dmem_desc = BootloaderDmemDescV2 {
> + reserved: [0; 4],
> + signature: [0; 4],
> + ctx_dma: 4, // FALCON_DMAIDX_PHYS_SYS_NCOH
> + code_dma_base: fw.dma_handle(),
> + non_sec_code_off: imem_ns.dst_start,
> + non_sec_code_size: imem_ns.len,
> + sec_code_off: imem_sec.dst_start,
> + sec_code_size: imem_sec.len,
> + code_entry_point: 0,
> + data_dma_base: fw.dma_handle() + dmem.src_start as u64,
> + data_size: dmem.len,
> + argc: 0,
> + argv: 0,
> + };
> +
> + regs::NV_PFALCON_FBIF_TRANSCFG::update(bar, &E::ID, 4, |v| {
> + v.set_target(FalconFbifTarget::CoherentSysmem)
> + .set_mem_type(FalconFbifMemType::Physical)
> + });
> +
> + self.pio_wr_bytes(bar, &dmem_desc as *const _ as *const u8, 0,
> + core::mem::size_of::<BootloaderDmemDescV2>(),
> + FalconMem::Dmem, 0, 0)?;
Once you have `AsBytes` implemented on `BootloaderDmemDescV2`, you can
just do:
self.pio_wr_bytes(bar, dmem_desc.as_bytes(), 0, FalconMem::Dmem, 0, 0)?;
> + } else {
> + self.pio_wr(bar, fw, FalconMem::ImemNs, &imem_ns, 0,
> + u16::try_from(imem_ns.dst_start >> 8)?)?;
> + self.pio_wr(bar, fw, FalconMem::ImemSec, &imem_sec, 0,
> + u16::try_from(imem_sec.dst_start >> 8)?)?;
> + self.pio_wr(bar, fw, FalconMem::Dmem, &dmem, 0, 0)?;
> + }
> +
> +
> + self.hal.program_brom(self, bar, &fw.brom_params())?;
> + // Set `BootVec` to start of non-secure code.
> + regs::NV_PFALCON_FALCON_BOOTVEC::default()
> + .set_value(fw.boot_addr())
> + .write(bar, &E::ID);
> +
> + Ok(())
> + }
> +
> /// Perform a DMA write according to `load_offsets` from `dma_handle`
> into the falcon's
> /// `target_mem`.
> ///
> diff --git a/drivers/gpu/nova-core/firmware.rs
> b/drivers/gpu/nova-core/firmware.rs
> index 5ca5bf1fb610..ecab16af0166 100644
> --- a/drivers/gpu/nova-core/firmware.rs
> +++ b/drivers/gpu/nova-core/firmware.rs
> @@ -31,7 +31,7 @@
> pub(crate) const FIRMWARE_VERSION: &str = "570.144";
>
> /// Requests the GPU firmware `name` suitable for `chipset`, with version
> `ver`.
> -fn request_firmware(
> +pub(crate) fn request_firmware(
This isn't needed: `request_firmware` is only ever called from child
modules, which can already access the private items of their parent.
> dev: &device::Device,
> chipset: gpu::Chipset,
> name: &str,
> @@ -252,7 +252,7 @@ fn no_patch_signature(self) -> FirmwareDmaObject<F,
> Signed> {
> /// Header common to most firmware files.
> #[repr(C)]
> #[derive(Debug, Clone)]
> -struct BinHdr {
> +pub(crate) struct BinHdr {
Same here.