Add a new VFIO PCI driver for NVIDIA GPUs that enables DMA testing via the Falcon (FAst Logic CONtroller) microcontrollers. This driver extracts and adapts the DMA test functionality from the NVIDIA gpu-admin-tools project and integrates it into the existing VFIO selftest framework.
The Falcon is a general-purpose microcontroller present on NVIDIA GPUs that can perform DMA operations between system memory and device memory. By leveraging Falcon DMA, this driver allows NVIDIA GPUs to be tested alongside Intel IOAT and DSA devices using the same selftest infrastructure. Supported GPUs: - Kepler: K520, GTX660, K4000, K80, GT635 - Maxwell Gen1: GTX750, GTX745 - Maxwell Gen2: M60 - Pascal: P100, P4, P40 - Volta: V100 - Turing: T4 - Ampere: A16, A100, A10 - Ada: L4, L40S - Hopper: H100 The PMU falcon on Kepler and Maxwell Gen1 GPUs uses legacy FBIF register offsets and requires enabling via PMC_ENABLE with the HUB bit set. Limitations and tradeoffs: 1. Architecture support: Blackwell and newer architectures may require additional work due to firmware differences (boot flow and DMA enablement are not verified there). 2. Synchronous DMA operations: Each transfer blocks until completion because the reference implementation does not expose command queuing - only one DMA operation can be in flight at a time. The driver is named 'nv_falcon' to reflect that it specifically controls the Falcon microcontroller for DMA operations, rather than exposing general GPU functionality. 
Reference implementation: https://github.com/NVIDIA/gpu-admin-tools Signed-off-by: Rubin Du <[email protected]> --- .../vfio/lib/drivers/nv_falcons/hw.h | 340 ++++++++ .../vfio/lib/drivers/nv_falcons/nv_falcons.c | 750 ++++++++++++++++++ tools/testing/selftests/vfio/lib/libvfio.mk | 2 + .../selftests/vfio/lib/vfio_pci_driver.c | 3 + 4 files changed, 1095 insertions(+) create mode 100644 tools/testing/selftests/vfio/lib/drivers/nv_falcons/hw.h create mode 100644 tools/testing/selftests/vfio/lib/drivers/nv_falcons/nv_falcons.c diff --git a/tools/testing/selftests/vfio/lib/drivers/nv_falcons/hw.h b/tools/testing/selftests/vfio/lib/drivers/nv_falcons/hw.h new file mode 100644 index 000000000000..30206a586c2e --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/nv_falcons/hw.h @@ -0,0 +1,340 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ +#ifndef FALCON_DMA_H +#define FALCON_DMA_H + +/* PMC (Power Management Controller) Registers */ +#define NV_PMC_BOOT_0 0x00000000 +#define NV_PMC_ENABLE 0x00000200 +#define NV_PMC_ENABLE_PWR 0x00002000 +#define NV_PMC_ENABLE_HUB 0x20000000 + +/* Falcon Base Pages for Different Engines */ +#define NV_PPWR_FALCON_BASE 0x10a000 +#define NV_PGSP_FALCON_BASE 0x110000 + +/* Falcon Common Register Offsets (relative to base_page) */ +#define NV_FALCON_DMACTL_OFFSET 0x010c +#define NV_FALCON_CPUCTL_OFFSET 0x0100 +#define NV_FALCON_ENGINE_RESET_OFFSET 0x03c0 + +/* DMEM Control Register Flags */ +#define NV_PPWR_FALCON_DMEMC_AINCR_TRUE 0x01000000 +#define NV_PPWR_FALCON_DMEMC_AINCW_TRUE 0x02000000 + +/* Falcon DMEM port offsets (for port 0) */ +#define NV_FALCON_DMEMC_OFFSET 0x1c0 +#define NV_FALCON_DMEMD_OFFSET 0x1c4 + +/* DMA Register Offsets (relative to base_page) */ +#define NV_FALCON_DMA_ADDR_LOW_OFFSET 0x110 +#define NV_FALCON_DMA_MEM_OFFSET 0x114 +#define NV_FALCON_DMA_CMD_OFFSET 0x118 +#define NV_FALCON_DMA_BLOCK_OFFSET 0x11c +#define 
NV_FALCON_DMA_ADDR_HIGH_OFFSET 0x128 + +/* DMA Global Address Top Bits Register */ +#define NV_GPU_DMA_ADDR_TOP_BITS_REG 0x100f04 + +/* DMA Command Register Bit Definitions */ +#define NV_FALCON_DMA_CMD_WRITE_BIT 0x20 +#define NV_FALCON_DMA_CMD_SIZE_SHIFT 8 +#define NV_FALCON_DMA_CMD_DONE_BIT 0x2 + +/* DMA block size and alignment */ +#define NV_FALCON_DMA_MAX_TRANSFER_SIZE 256 +#define NV_FALCON_DMA_MAX_TRANSFER_COUNT 1 + +/* DMACTL register bits */ +#define NV_FALCON_DMACTL_DMEM_SCRUBBING 0x1 +#define NV_FALCON_DMACTL_READY_MASK 0x6 + +/* Falcon Core Selection Register */ +#define NV_FALCON_CORE_SELECT_OFFSET 0x1668 +#define NV_FALCON_CORE_SELECT_MASK 0x30 + +/* Falcon mailbox register (for Ada+ reset check) */ +#define NV_FALCON_MAILBOX_TEST_OFFSET 0x40c +#define NV_FALCON_MAILBOX_RESET_MAGIC 0xbadf5620 + +/* Falcon Message Queue Register Offsets (relative to base_page) */ +#define NV_FALCON_QUEUE_HEAD_BASE_OFFSET 0x2c00 +#define NV_FALCON_QUEUE_TAIL_BASE_OFFSET 0x2c04 +#define NV_FALCON_QUEUE_STRIDE 0x8 +#define NV_FALCON_MSG_QUEUE_HEAD_BASE_OFFSET 0x2c80 +#define NV_FALCON_MSG_QUEUE_TAIL_BASE_OFFSET 0x2c84 + +/* FSP Falcon Base Pages */ +#define NV_FSP_FALCON_BASE 0x8f0100 +#define NV_FSP_FALCON_BASE_PAGE 0x8f0000 /* base_page = cpuctl & ~0xfff */ +#define NV_FSP_EMEM_BASE 0x8f2000 + +/* FSP EMEM Port Offsets (relative to FSP EMEM base) */ +#define NV_FSP_EMEMC_OFFSET 0xac0 +#define NV_FSP_EMEMD_OFFSET 0xac4 +#define NV_FSP_EMEM_PORT_STRIDE 0x8 + +/* EMEM Control Register Flags (same as DMEM) */ +#define NV_FALCON_EMEMC_AINCR 0x01000000 +#define NV_FALCON_EMEMC_AINCW 0x02000000 + +/* FSP RPC channel configuration */ +#define NV_FSP_RPC_CHANNEL_SIZE 1024 +#define NV_FSP_RPC_MAX_PACKET_SIZE 1024 +#define NV_FSP_RPC_CHANNEL_HOPPER 2 +#define NV_FSP_RPC_EMEM_BASE (NV_FSP_RPC_CHANNEL_HOPPER * NV_FSP_RPC_CHANNEL_SIZE) + +/* FSP EMEM port 2 registers (pre-computed for Hopper channel 2) */ +#define NV_FSP_EMEM_PORT2_CTRL (NV_FSP_EMEM_BASE + NV_FSP_EMEMC_OFFSET + \ + 
NV_FSP_RPC_CHANNEL_HOPPER * NV_FSP_EMEM_PORT_STRIDE) +#define NV_FSP_EMEM_PORT2_DATA (NV_FSP_EMEM_BASE + NV_FSP_EMEMD_OFFSET + \ + NV_FSP_RPC_CHANNEL_HOPPER * NV_FSP_EMEM_PORT_STRIDE) + +/* FSP queue register offsets (pre-computed for Hopper channel 2) */ +#define NV_FSP_QUEUE_HEAD (NV_FSP_FALCON_BASE_PAGE + NV_FALCON_QUEUE_HEAD_BASE_OFFSET + \ + NV_FSP_RPC_CHANNEL_HOPPER * NV_FALCON_QUEUE_STRIDE) +#define NV_FSP_QUEUE_TAIL (NV_FSP_FALCON_BASE_PAGE + NV_FALCON_QUEUE_TAIL_BASE_OFFSET + \ + NV_FSP_RPC_CHANNEL_HOPPER * NV_FALCON_QUEUE_STRIDE) +#define NV_FSP_MSG_QUEUE_HEAD (NV_FSP_FALCON_BASE_PAGE + NV_FALCON_MSG_QUEUE_HEAD_BASE_OFFSET + \ + NV_FSP_RPC_CHANNEL_HOPPER * NV_FALCON_QUEUE_STRIDE) +#define NV_FSP_MSG_QUEUE_TAIL (NV_FSP_FALCON_BASE_PAGE + NV_FALCON_MSG_QUEUE_TAIL_BASE_OFFSET + \ + NV_FSP_RPC_CHANNEL_HOPPER * NV_FALCON_QUEUE_STRIDE) + +/* MCTP Header */ +#define NV_MCTP_HDR_SEID_SHIFT 16 +#define NV_MCTP_HDR_SEID_MASK 0xff +#define NV_MCTP_HDR_SEQ_SHIFT 28 +#define NV_MCTP_HDR_SEQ_MASK 0x3 +#define NV_MCTP_HDR_EOM_BIT 0x40000000 +#define NV_MCTP_HDR_SOM_BIT 0x80000000 + +/* MCTP Message Header */ +#define NV_MCTP_MSG_TYPE_SHIFT 0 +#define NV_MCTP_MSG_TYPE_MASK 0x7f +#define NV_MCTP_MSG_TYPE_VENDOR_DEFINED 0x7e +#define NV_MCTP_MSG_VENDOR_ID_SHIFT 8 +#define NV_MCTP_MSG_VENDOR_ID_MASK 0xffff +#define NV_MCTP_MSG_VENDOR_ID_NVIDIA 0x10de +#define NV_MCTP_MSG_NVDM_TYPE_SHIFT 24 +#define NV_MCTP_MSG_NVDM_TYPE_MASK 0xff + +/* NVDM response type */ +#define NV_NVDM_TYPE_RESPONSE 0x15 + +/* Minimum response size: mctp_hdr + msg_hdr + status_hdr + type + status */ +#define NV_FSP_RPC_MIN_RESPONSE_WORDS 5 + +/* FBIF (Frame Buffer Interface) Registers */ +/* Legacy PMU FBIF offsets (Kepler, Maxwell Gen1) */ +#define NV_PMU_LEGACY_FBIF_CTL_OFFSET 0x624 +#define NV_PMU_LEGACY_FBIF_TRANSCFG_OFFSET 0x600 + +/* PMU FBIF offsets */ +#define NV_PMU_FBIF_CTL_OFFSET 0xe24 +#define NV_PMU_FBIF_TRANSCFG_OFFSET 0xe00 + +/* GSP FBIF offsets */ +#define NV_GSP_FBIF_CTL_OFFSET 0x624 
+#define NV_GSP_FBIF_TRANSCFG_OFFSET 0x600 + +/* OFA Falcon Base Page and FBIF offsets (used for Hopper+ DMA) */ +#define NV_OFA_FALCON_BASE 0x844000 +#define NV_OFA_FBIF_CTL_OFFSET 0x424 +#define NV_OFA_FBIF_TRANSCFG_OFFSET 0x400 + +/* OFA DMA support check register (Hopper+) */ +#define NV_OFA_DMA_SUPPORT_CHECK_REG 0x8443c0 + +/* FSP NVDM command types */ +#define NV_NVDM_TYPE_FBDMA 0x22 +#define NV_FBDMA_SUBCMD_ENABLE 0x1 + +/* FBIF CTL2 offset (relative to fbif_ctl) */ +#define NV_FBIF_CTL2_OFFSET 0x60 + +/* FBIF TRANSCFG register bits */ +#define NV_FBIF_TRANSCFG_TARGET_MASK 0x3 +#define NV_FBIF_TRANSCFG_SYSMEM_DEFAULT 0x5 + +/* FBIF CTL register bits */ +#define NV_FBIF_CTL_ALLOW_PHYS_MODE 0x10 +#define NV_FBIF_CTL_ALLOW_FULL_PHYS_MODE 0x80 + +/* Memory clear register offsets */ +#define NV_MEM_CLEAR_OFFSET 0x100b20 +#define NV_BOOT_COMPLETE_OFFSET 0x118234 +#define NV_BOOT_COMPLETE_MASK 0x3ff + +/* FSP boot complete register (Hopper+) */ +#define NV_FSP_BOOT_COMPLETE_OFFSET 0x200bc +#define NV_FSP_BOOT_COMPLETE_MASK 0xff + +enum gpu_arch { + GPU_ARCH_UNKNOWN = -1, + GPU_ARCH_KEPLER = 0, + GPU_ARCH_MAXWELL_GEN1, + GPU_ARCH_MAXWELL_GEN2, + GPU_ARCH_PASCAL, + GPU_ARCH_VOLTA, + GPU_ARCH_TURING, + GPU_ARCH_AMPERE, + GPU_ARCH_ADA, + GPU_ARCH_HOPPER, +}; + +enum falcon_type { + FALCON_TYPE_PMU_LEGACY = 0, + FALCON_TYPE_PMU, + FALCON_TYPE_GSP, + FALCON_TYPE_OFA, +}; + +struct falcon { + u32 base_page; + u32 dmactl; + u32 engine_reset; + u32 fbif_ctl; + u32 fbif_ctl2; + u32 fbif_transcfg; + u32 dmem_control_reg; + u32 dmem_data_reg; + bool no_outside_reset; +}; + +struct gpu_properties { + u32 pmc_enable_mask; + bool memory_clear_supported; + enum falcon_type falcon_type; +}; + +struct gpu_device { + enum gpu_arch arch; + void *bar0; + bool is_memory_clear_supported; + const struct falcon *falcon; + u32 pmc_enable_mask; + bool fsp_dma_enabled; +}; + +static const u32 verified_gpu_map[] = { + 0x0e40a0a2, /* K520 */ + 0x0e6000a1, /* GTX660 */ + 0x0e63a0a1, /* K4000 */ 
+ 0x0f22d0a1, /* K80 */ + 0x108000a1, /* GT635 */ + 0x117010a2, /* GTX750 */ + 0x117020a2, /* GTX745 */ + 0x124320a1, /* M60 */ + 0x130000a1, /* P100 */ + 0x134000a1, /* P4 */ + 0x132000a1, /* P40 */ + 0x140000a1, /* V100 */ + 0x164000a1, /* T4 */ + 0xb77000a1, /* A16 */ + 0x170000a1, /* A100 */ + 0xb72000a1, /* A10 */ + 0x194000a1, /* L4 */ + 0x192000a1, /* L40S */ +}; + +#define VERIFIED_GPU_MAP_SIZE ARRAY_SIZE(verified_gpu_map) + +static const struct gpu_properties gpu_properties_map[] = { + [GPU_ARCH_KEPLER] = { + .pmc_enable_mask = NV_PMC_ENABLE_PWR | NV_PMC_ENABLE_HUB, + .memory_clear_supported = false, + .falcon_type = FALCON_TYPE_PMU_LEGACY, + }, + [GPU_ARCH_MAXWELL_GEN1] = { + .pmc_enable_mask = NV_PMC_ENABLE_PWR | NV_PMC_ENABLE_HUB, + .memory_clear_supported = false, + .falcon_type = FALCON_TYPE_PMU_LEGACY, + }, + [GPU_ARCH_MAXWELL_GEN2] = { + .pmc_enable_mask = NV_PMC_ENABLE_PWR, + .memory_clear_supported = false, + .falcon_type = FALCON_TYPE_PMU, + }, + [GPU_ARCH_PASCAL] = { + .pmc_enable_mask = NV_PMC_ENABLE_PWR, + .memory_clear_supported = false, + .falcon_type = FALCON_TYPE_PMU, + }, + [GPU_ARCH_VOLTA] = { + .pmc_enable_mask = 0, + .memory_clear_supported = false, + .falcon_type = FALCON_TYPE_GSP, + }, + [GPU_ARCH_TURING] = { + .pmc_enable_mask = 0, + .memory_clear_supported = true, + .falcon_type = FALCON_TYPE_GSP, + }, + [GPU_ARCH_AMPERE] = { + .pmc_enable_mask = 0, + .memory_clear_supported = true, + .falcon_type = FALCON_TYPE_GSP, + }, + [GPU_ARCH_ADA] = { + .pmc_enable_mask = 0, + .memory_clear_supported = true, + .falcon_type = FALCON_TYPE_PMU, + }, + [GPU_ARCH_HOPPER] = { + .pmc_enable_mask = 0, + .memory_clear_supported = true, + .falcon_type = FALCON_TYPE_OFA, + }, +}; + +static const struct falcon falcon_map[] = { + [FALCON_TYPE_PMU_LEGACY] = { + .base_page = NV_PPWR_FALCON_BASE, + .dmactl = NV_PPWR_FALCON_BASE + NV_FALCON_DMACTL_OFFSET, + .engine_reset = NV_PPWR_FALCON_BASE + NV_FALCON_ENGINE_RESET_OFFSET, + .fbif_ctl = NV_PPWR_FALCON_BASE 
+ NV_PMU_LEGACY_FBIF_CTL_OFFSET, + .fbif_ctl2 = NV_PPWR_FALCON_BASE + NV_PMU_LEGACY_FBIF_CTL_OFFSET + NV_FBIF_CTL2_OFFSET, + .fbif_transcfg = NV_PPWR_FALCON_BASE + NV_PMU_LEGACY_FBIF_TRANSCFG_OFFSET, + .dmem_control_reg = NV_PPWR_FALCON_BASE + NV_FALCON_DMEMC_OFFSET, + .dmem_data_reg = NV_PPWR_FALCON_BASE + NV_FALCON_DMEMD_OFFSET, + .no_outside_reset = false, + }, + [FALCON_TYPE_PMU] = { + .base_page = NV_PPWR_FALCON_BASE, + .dmactl = NV_PPWR_FALCON_BASE + NV_FALCON_DMACTL_OFFSET, + .engine_reset = NV_PPWR_FALCON_BASE + NV_FALCON_ENGINE_RESET_OFFSET, + .fbif_ctl = NV_PPWR_FALCON_BASE + NV_PMU_FBIF_CTL_OFFSET, + .fbif_ctl2 = NV_PPWR_FALCON_BASE + NV_PMU_FBIF_CTL_OFFSET + NV_FBIF_CTL2_OFFSET, + .fbif_transcfg = NV_PPWR_FALCON_BASE + NV_PMU_FBIF_TRANSCFG_OFFSET, + .dmem_control_reg = NV_PPWR_FALCON_BASE + NV_FALCON_DMEMC_OFFSET, + .dmem_data_reg = NV_PPWR_FALCON_BASE + NV_FALCON_DMEMD_OFFSET, + .no_outside_reset = false, + }, + [FALCON_TYPE_GSP] = { + .base_page = NV_PGSP_FALCON_BASE, + .dmactl = NV_PGSP_FALCON_BASE + NV_FALCON_DMACTL_OFFSET, + .engine_reset = NV_PGSP_FALCON_BASE + NV_FALCON_ENGINE_RESET_OFFSET, + .fbif_ctl = NV_PGSP_FALCON_BASE + NV_GSP_FBIF_CTL_OFFSET, + .fbif_ctl2 = NV_PGSP_FALCON_BASE + NV_GSP_FBIF_CTL_OFFSET + NV_FBIF_CTL2_OFFSET, + .fbif_transcfg = NV_PGSP_FALCON_BASE + NV_GSP_FBIF_TRANSCFG_OFFSET, + .dmem_control_reg = NV_PGSP_FALCON_BASE + NV_FALCON_DMEMC_OFFSET, + .dmem_data_reg = NV_PGSP_FALCON_BASE + NV_FALCON_DMEMD_OFFSET, + .no_outside_reset = false, + }, + [FALCON_TYPE_OFA] = { + .base_page = NV_OFA_FALCON_BASE, + .dmactl = NV_OFA_FALCON_BASE + NV_FALCON_DMACTL_OFFSET, + .engine_reset = NV_OFA_FALCON_BASE + NV_FALCON_ENGINE_RESET_OFFSET, + .fbif_ctl = NV_OFA_FALCON_BASE + NV_OFA_FBIF_CTL_OFFSET, + .fbif_ctl2 = NV_OFA_FALCON_BASE + NV_OFA_FBIF_CTL_OFFSET + NV_FBIF_CTL2_OFFSET, + .fbif_transcfg = NV_OFA_FALCON_BASE + NV_OFA_FBIF_TRANSCFG_OFFSET, + .dmem_control_reg = NV_OFA_FALCON_BASE + NV_FALCON_DMEMC_OFFSET, + .dmem_data_reg = 
NV_OFA_FALCON_BASE + NV_FALCON_DMEMD_OFFSET, + .no_outside_reset = true, + }, +}; + + +#endif /* FALCON_DMA_H */ diff --git a/tools/testing/selftests/vfio/lib/drivers/nv_falcons/nv_falcons.c b/tools/testing/selftests/vfio/lib/drivers/nv_falcons/nv_falcons.c new file mode 100644 index 000000000000..1ed7e7336601 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/nv_falcons/nv_falcons.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ +#include <stdint.h> +#include <strings.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <time.h> + +#include <linux/errno.h> +#include <linux/io.h> +#include <linux/pci_ids.h> + +#include <libvfio.h> + +#include "hw.h" + +static inline struct gpu_device *to_nv_gpu(struct vfio_pci_device *device) +{ + return device->driver.region.vaddr; +} + +static enum gpu_arch nv_gpu_arch_lookup(u32 pmc_boot_0) +{ + u32 arch = (pmc_boot_0 >> 24) & 0x1f; + + switch (arch) { + case 0x0e: + case 0x0f: + case 0x10: + return GPU_ARCH_KEPLER; + case 0x11: + return GPU_ARCH_MAXWELL_GEN1; + case 0x12: + return GPU_ARCH_MAXWELL_GEN2; + case 0x13: + return GPU_ARCH_PASCAL; + case 0x14: + return GPU_ARCH_VOLTA; + case 0x16: + return GPU_ARCH_TURING; + case 0x17: + return GPU_ARCH_AMPERE; + case 0x18: + return GPU_ARCH_HOPPER; + case 0x19: + return GPU_ARCH_ADA; + default: + return GPU_ARCH_UNKNOWN; + } +} + +static inline u32 gpu_read32(struct gpu_device *gpu, u32 offset) +{ + return readl(gpu->bar0 + offset); +} + +static inline void gpu_write32(struct gpu_device *gpu, u32 offset, u32 value) +{ + writel(value, gpu->bar0 + offset); +} + +static int gpu_poll_register(struct vfio_pci_device *device, + const char *name, u32 offset, + u32 expected, u32 mask, u32 timeout_ms) +{ + struct gpu_device *gpu = to_nv_gpu(device); + u32 value; + struct timespec start, now; + u64 elapsed_ms; + + clock_gettime(CLOCK_MONOTONIC, &start); + + for 
(;;) { + value = gpu_read32(gpu, offset); + if ((value & mask) == expected) + return 0; + + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed_ms = (now.tv_sec - start.tv_sec) * 1000 + + (now.tv_nsec - start.tv_nsec) / 1000000; + + if (elapsed_ms >= timeout_ms) + break; + + usleep(1000); + } + + dev_err(device, + "Timeout polling %s (0x%x): value=0x%x expected=0x%x mask=0x%x after %llu ms\n", + name, offset, value, expected, mask, + (unsigned long long)elapsed_ms); + return -ETIMEDOUT; +} + +static int fsp_poll_queue(struct gpu_device *gpu, u32 head_reg, u32 tail_reg, + bool wait_empty, u32 timeout_ms) +{ + struct timespec start, now; + u64 elapsed_ms; + u32 head, tail; + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (;;) { + head = gpu_read32(gpu, head_reg); + tail = gpu_read32(gpu, tail_reg); + if (wait_empty ? (head == tail) : (head != tail)) + return 0; + + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed_ms = (now.tv_sec - start.tv_sec) * 1000 + + (now.tv_nsec - start.tv_nsec) / 1000000; + + if (elapsed_ms >= timeout_ms) + return -ETIMEDOUT; + + usleep(1000); + } +} + +static void fsp_emem_write(struct gpu_device *gpu, u32 offset, + const u32 *data, u32 count) +{ + u32 i; + + /* Configure port with auto-increment for read and write */ + gpu_write32(gpu, NV_FSP_EMEM_PORT2_CTRL, + offset | NV_FALCON_EMEMC_AINCR | NV_FALCON_EMEMC_AINCW); + + for (i = 0; i < count; i++) + gpu_write32(gpu, NV_FSP_EMEM_PORT2_DATA, data[i]); +} + +static void fsp_emem_read(struct gpu_device *gpu, u32 offset, + u32 *data, u32 count) +{ + u32 i; + + /* Configure port with auto-increment for read and write */ + gpu_write32(gpu, NV_FSP_EMEM_PORT2_CTRL, + offset | NV_FALCON_EMEMC_AINCR | NV_FALCON_EMEMC_AINCW); + + for (i = 0; i < count; i++) + data[i] = gpu_read32(gpu, NV_FSP_EMEM_PORT2_DATA); +} + +static int fsp_rpc_send_data(struct gpu_device *gpu, const u32 *data, u32 count) +{ + int ret; + + ret = fsp_poll_queue(gpu, NV_FSP_QUEUE_HEAD, NV_FSP_QUEUE_TAIL, true, 1000); + if (ret) + 
return ret; + + fsp_emem_write(gpu, NV_FSP_RPC_EMEM_BASE, data, count); + + /* Update queue head/tail to signal data is ready */ + gpu_write32(gpu, NV_FSP_QUEUE_TAIL, + NV_FSP_RPC_EMEM_BASE + (count - 1) * 4); + gpu_write32(gpu, NV_FSP_QUEUE_HEAD, NV_FSP_RPC_EMEM_BASE); + + return 0; +} + +static int fsp_rpc_receive_data(struct gpu_device *gpu, u32 *data, + u32 max_count, u32 timeout_ms) +{ + u32 head, tail; + u32 msg_size_words; + int ret; + + ret = fsp_poll_queue(gpu, NV_FSP_MSG_QUEUE_HEAD, NV_FSP_MSG_QUEUE_TAIL, + false, timeout_ms); + if (ret) + return ret; + + head = gpu_read32(gpu, NV_FSP_MSG_QUEUE_HEAD); + tail = gpu_read32(gpu, NV_FSP_MSG_QUEUE_TAIL); + + msg_size_words = (tail - head + 4) / 4; + if (msg_size_words > max_count) + msg_size_words = max_count; + + fsp_emem_read(gpu, NV_FSP_RPC_EMEM_BASE, data, msg_size_words); + + /* Reset message queue tail to acknowledge receipt */ + gpu_write32(gpu, NV_FSP_MSG_QUEUE_TAIL, head); + + return msg_size_words; +} + +static void fsp_reset_rpc_state(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + u32 head, tail; + + head = gpu_read32(gpu, NV_FSP_QUEUE_HEAD); + tail = gpu_read32(gpu, NV_FSP_QUEUE_TAIL); + + if (head == tail) { + head = gpu_read32(gpu, NV_FSP_MSG_QUEUE_HEAD); + tail = gpu_read32(gpu, NV_FSP_MSG_QUEUE_TAIL); + if (head == tail) + return; + } + + fsp_poll_queue(gpu, NV_FSP_MSG_QUEUE_HEAD, NV_FSP_MSG_QUEUE_TAIL, false, 5000); + + gpu_write32(gpu, NV_FSP_QUEUE_TAIL, NV_FSP_RPC_EMEM_BASE); + gpu_write32(gpu, NV_FSP_QUEUE_HEAD, NV_FSP_RPC_EMEM_BASE); + gpu_write32(gpu, NV_FSP_MSG_QUEUE_TAIL, NV_FSP_RPC_EMEM_BASE); + gpu_write32(gpu, NV_FSP_MSG_QUEUE_HEAD, NV_FSP_RPC_EMEM_BASE); +} + +static inline u32 mctp_header_build(u8 seid, u8 seq, bool som, bool eom) +{ + u32 hdr = 0; + + hdr |= (seid & NV_MCTP_HDR_SEID_MASK) << NV_MCTP_HDR_SEID_SHIFT; + hdr |= (seq & NV_MCTP_HDR_SEQ_MASK) << NV_MCTP_HDR_SEQ_SHIFT; + if (som) + hdr |= NV_MCTP_HDR_SOM_BIT; + if (eom) + hdr |= 
NV_MCTP_HDR_EOM_BIT; + + return hdr; +} + +static inline u32 mctp_msg_header_build(u8 nvdm_type) +{ + u32 hdr = 0; + + hdr |= (NV_MCTP_MSG_TYPE_VENDOR_DEFINED & NV_MCTP_MSG_TYPE_MASK) + << NV_MCTP_MSG_TYPE_SHIFT; + hdr |= (NV_MCTP_MSG_VENDOR_ID_NVIDIA & NV_MCTP_MSG_VENDOR_ID_MASK) + << NV_MCTP_MSG_VENDOR_ID_SHIFT; + hdr |= (nvdm_type & NV_MCTP_MSG_NVDM_TYPE_MASK) + << NV_MCTP_MSG_NVDM_TYPE_SHIFT; + + return hdr; +} + +static inline u8 mctp_msg_header_get_nvdm_type(u32 hdr) +{ + return (hdr >> NV_MCTP_MSG_NVDM_TYPE_SHIFT) & NV_MCTP_MSG_NVDM_TYPE_MASK; +} + +static int fsp_rpc_send_cmd(struct vfio_pci_device *device, u8 nvdm_type, + const u32 *data, u32 data_count, u32 timeout_ms) +{ + struct gpu_device *gpu = to_nv_gpu(device); + u32 max_packet_words = NV_FSP_RPC_MAX_PACKET_SIZE / 4; + u32 packet[256]; + u32 resp_buf[256]; + u32 total_words; + int resp_words; + u8 resp_nvdm_type; + int ret; + + total_words = 2 + data_count; + if (total_words > max_packet_words) + return -EINVAL; + + packet[0] = mctp_header_build(0, 0, true, true); + packet[1] = mctp_msg_header_build(nvdm_type); + + if (data_count > 0) + memcpy(&packet[2], data, data_count * sizeof(u32)); + + ret = fsp_rpc_send_data(gpu, packet, total_words); + if (ret) + return ret; + + resp_words = fsp_rpc_receive_data(gpu, resp_buf, 256, timeout_ms); + if (resp_words < 0) + return resp_words; + + if (resp_words < NV_FSP_RPC_MIN_RESPONSE_WORDS) + return -EPROTO; + + resp_nvdm_type = mctp_msg_header_get_nvdm_type(resp_buf[1]); + if (resp_nvdm_type != NV_NVDM_TYPE_RESPONSE) + return -EPROTO; + + if (resp_buf[3] != nvdm_type) + return -EPROTO; + + if (resp_buf[4] != 0) + return -resp_buf[4]; + + return 0; +} + +static void fsp_init(struct vfio_pci_device *device) +{ + gpu_poll_register(device, "fsp_boot_complete", NV_FSP_BOOT_COMPLETE_OFFSET, + NV_FSP_BOOT_COMPLETE_MASK, 0xffffffff, 5000); + fsp_reset_rpc_state(device); +} + +static int fsp_fbdma_enable(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = 
to_nv_gpu(device); + u32 cmd_data = NV_FBDMA_SUBCMD_ENABLE; + int ret; + + if (gpu->fsp_dma_enabled) + return 0; + + ret = fsp_rpc_send_cmd(device, NV_NVDM_TYPE_FBDMA, &cmd_data, 1, 5000); + if (ret < 0) + return ret; + + gpu->fsp_dma_enabled = true; + return 0; +} + +static bool fsp_check_ofa_dma_support(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + u32 val = gpu_read32(gpu, NV_OFA_DMA_SUPPORT_CHECK_REG); + + return (val >> 16) != 0xbadf; +} + +static int size_to_dma_encoding(u32 size) +{ + if (size < 4 || size > 256 || (size & (size - 1))) + return -1; + + return ffs(size) - 3; +} + +static void gpu_enable_bus_master(struct vfio_pci_device *device) +{ + u16 cmd; + + cmd = vfio_pci_config_readw(device, PCI_COMMAND); + vfio_pci_config_writew(device, PCI_COMMAND, cmd | PCI_COMMAND_MASTER); +} + +static void gpu_disable_bus_master(struct vfio_pci_device *device) +{ + u16 cmd; + + cmd = vfio_pci_config_readw(device, PCI_COMMAND); + vfio_pci_config_writew(device, PCI_COMMAND, cmd & ~PCI_COMMAND_MASTER); +} + +static void falcon_dmem_port_configure(struct vfio_pci_device *device, + u32 offset, bool auto_inc_read, + bool auto_inc_write) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon *falcon = gpu->falcon; + u32 memc_value = offset; + + /* Set auto-increment flags */ + if (auto_inc_read) + memc_value |= NV_PPWR_FALCON_DMEMC_AINCR_TRUE; + if (auto_inc_write) + memc_value |= NV_PPWR_FALCON_DMEMC_AINCW_TRUE; + + gpu_write32(gpu, falcon->dmem_control_reg, memc_value); +} + +static void falcon_select_core_falcon(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon *falcon = gpu->falcon; + u32 core_select_reg = falcon->base_page + NV_FALCON_CORE_SELECT_OFFSET; + u32 core_select; + + /* Read current value */ + core_select = gpu_read32(gpu, core_select_reg); + + /* Clear bits 4:5 to select falcon core (not RISCV) */ + core_select &= ~NV_FALCON_CORE_SELECT_MASK; + + 
gpu_write32(gpu, core_select_reg, core_select); +} + +static void falcon_enable(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon *falcon = gpu->falcon; + u32 mailbox_test_reg; + u32 mailbox_val; + + /* Ada-specific: Check if falcon needs reset before enable */ + if (gpu->arch == GPU_ARCH_ADA) { + mailbox_test_reg = falcon->base_page + NV_FALCON_MAILBOX_TEST_OFFSET; + mailbox_val = gpu_read32(gpu, mailbox_test_reg); + if (mailbox_val == NV_FALCON_MAILBOX_RESET_MAGIC) + gpu_write32(gpu, falcon->engine_reset, 1); + } + + /* Enable the falcon based on control method */ + if (!falcon->no_outside_reset) { + if (gpu->pmc_enable_mask != 0) { + u32 pmc_enable; + + /* Enable via PMC_ENABLE register */ + pmc_enable = gpu_read32(gpu, NV_PMC_ENABLE); + gpu_write32(gpu, NV_PMC_ENABLE, pmc_enable | gpu->pmc_enable_mask); + } else { + /* Enable by deasserting engine reset */ + gpu_write32(gpu, falcon->engine_reset, 0); + } + } + + if (gpu->arch < GPU_ARCH_HOPPER) { + falcon_select_core_falcon(device); + + /* Wait for DMACTL to be ready (bits 1:2 should be 0) */ + gpu_poll_register(device, "falcon_dmactl", falcon->dmactl, + 0, NV_FALCON_DMACTL_READY_MASK, 1000); + } +} + +static void falcon_disable(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon *falcon = gpu->falcon; + u32 pmc_enable; + + if (falcon->no_outside_reset) + return; + + if (gpu->pmc_enable_mask != 0) { + /* Disable via PMC_ENABLE */ + pmc_enable = gpu_read32(gpu, NV_PMC_ENABLE); + gpu_write32(gpu, NV_PMC_ENABLE, pmc_enable & ~gpu->pmc_enable_mask); + } else { + /* Disable by asserting engine reset */ + gpu_write32(gpu, falcon->engine_reset, 1); + } +} + +static void falcon_reset(struct vfio_pci_device *device) +{ + falcon_disable(device); + + falcon_enable(device); +} + +static int nv_gpu_falcon_dma_init(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon 
*falcon; + u32 transcfg; + u32 dmactl; + u32 ctl; + int ret; + + if (!gpu) { + dev_err(device, "GPU device not initialized\n"); + return -EINVAL; + } + + falcon = gpu->falcon; + + gpu_enable_bus_master(device); + + if (gpu->arch >= GPU_ARCH_HOPPER) { + fsp_init(device); + + ret = fsp_fbdma_enable(device); + if (ret) { + dev_err(device, "Failed to enable FSP FBDMA: %d\n", ret); + return ret; + } + + if (!fsp_check_ofa_dma_support(device)) { + dev_err(device, "OFA DMA not supported with current firmware\n"); + return -ENOTSUP; + } + } + + if (gpu->is_memory_clear_supported) { + /* For Turing+, wait for boot to complete first */ + if (gpu->arch >= GPU_ARCH_TURING) { + /* Wait for boot complete - Hopper+ uses FSP register */ + if (gpu->arch >= GPU_ARCH_HOPPER) { + gpu_poll_register(device, "fsp_boot_complete", + NV_FSP_BOOT_COMPLETE_OFFSET, + NV_FSP_BOOT_COMPLETE_MASK, 0xffffffff, 5000); + } else { + gpu_poll_register(device, "boot_complete", + NV_BOOT_COMPLETE_OFFSET, + NV_BOOT_COMPLETE_MASK, 0xffffffff, 5000); + } + gpu_poll_register(device, "memory_clear_finished", + NV_MEM_CLEAR_OFFSET, 0x1, 0xffffffff, 5000); + } + } + + if (!falcon->no_outside_reset) + falcon_reset(device); + + falcon_dmem_port_configure(device, 0, false, false); + + transcfg = gpu_read32(gpu, falcon->fbif_transcfg); + transcfg &= ~NV_FBIF_TRANSCFG_TARGET_MASK; + transcfg |= NV_FBIF_TRANSCFG_SYSMEM_DEFAULT; + gpu_write32(gpu, falcon->fbif_transcfg, transcfg); + + gpu_write32(gpu, falcon->fbif_ctl2, 0x1); + + ctl = gpu_read32(gpu, falcon->fbif_ctl); + ctl |= NV_FBIF_CTL_ALLOW_PHYS_MODE | NV_FBIF_CTL_ALLOW_FULL_PHYS_MODE; + gpu_write32(gpu, falcon->fbif_ctl, ctl); + + dmactl = gpu_read32(gpu, falcon->dmactl); + dmactl &= ~NV_FALCON_DMACTL_DMEM_SCRUBBING; + gpu_write32(gpu, falcon->dmactl, dmactl); + + return 0; +} + +static int nv_gpu_falcon_dma(struct vfio_pci_device *device, + u64 address, + u32 size_encoding, + bool write) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon 
*falcon = gpu->falcon; + u32 dma_cmd; + int ret; + + gpu_write32(gpu, NV_GPU_DMA_ADDR_TOP_BITS_REG, + (address >> 47) & 0xffffffff); + gpu_write32(gpu, falcon->base_page + NV_FALCON_DMA_ADDR_HIGH_OFFSET, + (address >> 40) & 0xff); + gpu_write32(gpu, falcon->base_page + NV_FALCON_DMA_ADDR_LOW_OFFSET, + (address >> 8) & 0xffffffff); + gpu_write32(gpu, falcon->base_page + NV_FALCON_DMA_BLOCK_OFFSET, + address & 0xff); + gpu_write32(gpu, falcon->base_page + NV_FALCON_DMA_MEM_OFFSET, 0); + + dma_cmd = (size_encoding << NV_FALCON_DMA_CMD_SIZE_SHIFT); + + /* Set direction: write (DMEM->mem) or read (mem->DMEM) */ + if (write) + dma_cmd |= NV_FALCON_DMA_CMD_WRITE_BIT; + + gpu_write32(gpu, falcon->base_page + NV_FALCON_DMA_CMD_OFFSET, dma_cmd); + + ret = gpu_poll_register(device, "dma_done", + falcon->base_page + NV_FALCON_DMA_CMD_OFFSET, + NV_FALCON_DMA_CMD_DONE_BIT, NV_FALCON_DMA_CMD_DONE_BIT, + 1000); + if (ret) + return ret; + + return 0; +} + +static int nv_gpu_memcpy_chunk(struct vfio_pci_device *device, + iova_t src, + iova_t dst, + u32 size_encoding) +{ + int ret; + + ret = nv_gpu_falcon_dma(device, src, size_encoding, false); + if (ret) { + dev_err(device, "Failed to queue DMA read (src=0x%llx, size=%u)\n", + (unsigned long long)src, size_encoding); + return ret; + } + + ret = nv_gpu_falcon_dma(device, dst, size_encoding, true); + if (ret) { + dev_err(device, "Failed to queue DMA write (dst=0x%llx, size=%u)\n", + (unsigned long long)dst, size_encoding); + return ret; + } + + return 0; +} + +static int nv_gpu_probe(struct vfio_pci_device *device) +{ + enum gpu_arch gpu_arch; + u32 pmc_boot_0; + void *bar0; + int i; + + if (vfio_pci_config_readw(device, PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) + return -ENODEV; + + if (vfio_pci_config_readw(device, PCI_CLASS_DEVICE) >> 8 != + PCI_BASE_CLASS_DISPLAY) + return -ENODEV; + + /* Get BAR0 pointer for reading GPU registers */ + bar0 = device->bars[0].vaddr; + if (!bar0) + return -ENODEV; + + /* Read PMC_BOOT_0 register from 
BAR0 to identify GPU */ + pmc_boot_0 = readl(bar0 + NV_PMC_BOOT_0); + + /* Look up GPU architecture to verify this is a supported GPU */ + gpu_arch = nv_gpu_arch_lookup(pmc_boot_0); + if (gpu_arch == GPU_ARCH_UNKNOWN) { + dev_err(device, "Unsupported GPU architecture for PMC_BOOT_0: 0x%x\n", + pmc_boot_0); + return -ENODEV; + } + + /* Check verified GPU map */ + for (i = 0; i < VERIFIED_GPU_MAP_SIZE; i++) { + if (verified_gpu_map[i] == pmc_boot_0) + return 0; + } + + dev_info(device, "Unvalidated GPU: PMC_BOOT_0: 0x%x, possibly not supported\n", + pmc_boot_0); + + return 0; +} + +static void nv_gpu_init(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct gpu_properties *props; + enum gpu_arch gpu_arch; + u32 pmc_boot_0; + int ret; + + /* Get GPU state from DMA-accessible region */ + VFIO_ASSERT_GE(device->driver.region.size, sizeof(*gpu)); + + /* Read PMC_BOOT_0 register from BAR0 to identify GPU */ + pmc_boot_0 = readl(device->bars[0].vaddr + NV_PMC_BOOT_0); + + /* Look up GPU architecture */ + gpu_arch = nv_gpu_arch_lookup(pmc_boot_0); + if (gpu_arch == GPU_ARCH_UNKNOWN) { + dev_err(device, "Unsupported GPU architecture\n"); + return; + } + + props = &gpu_properties_map[gpu_arch]; + + /* Populate GPU structure */ + gpu->arch = gpu_arch; + gpu->bar0 = device->bars[0].vaddr; + gpu->is_memory_clear_supported = props->memory_clear_supported; + gpu->falcon = &falcon_map[props->falcon_type]; + gpu->pmc_enable_mask = props->pmc_enable_mask; + + falcon_enable(device); + + /* Initialize falcon for DMA */ + ret = nv_gpu_falcon_dma_init(device); + if (ret) { + dev_err(device, "Failed to initialize falcon DMA: %d\n", ret); + return; + } + + /* Set DMA transfer limits to comply with falcon DMA constraints */ + device->driver.max_memcpy_size = NV_FALCON_DMA_MAX_TRANSFER_SIZE; + device->driver.max_memcpy_count = NV_FALCON_DMA_MAX_TRANSFER_COUNT; +} + +static void nv_gpu_remove(struct vfio_pci_device *device) +{ + 
falcon_disable(device); + gpu_disable_bus_master(device); +} + +static void nv_gpu_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count) +{ + /* + * This memcpy implementation is synchronous, meaning it performs the memory + * copy operation in a blocking fashion. The copy is submitted, and the call + * will not return until the entire requested buffer has been copied. + * If multiple chunks are needed, each chunk is copied sequentially. + * + * Note: nv_gpu_memcpy_wait() can be used as a synchronization point + * for chunked or asynchronous implementations if ever needed. + */ + u64 iteration; + u64 offset; + int ret; + + /* Perform the copy operation in chunks, repeated 'count' times */ + for (iteration = 0; iteration < count; iteration++) { + offset = 0; + + while (offset < size) { + int chunk_encoding; + u64 remaining = size - offset; + + if (remaining >= NV_FALCON_DMA_MAX_TRANSFER_SIZE) + chunk_encoding = size_to_dma_encoding(NV_FALCON_DMA_MAX_TRANSFER_SIZE); + else + chunk_encoding = size_to_dma_encoding(remaining); + + if (chunk_encoding < 0) { + dev_err(device, "Invalid chunk encoding: %d\n", chunk_encoding); + return; + } + + ret = nv_gpu_memcpy_chunk(device, + src + offset, + dst + offset, + chunk_encoding); + if (ret) { + dev_err(device, "Failed to queue chunk at offset %llu: %d\n", + (unsigned long long)offset, ret); + return; + } + + offset += 0x4 << chunk_encoding; + } + } +} + +static int nv_gpu_memcpy_wait(struct vfio_pci_device *device) +{ + struct gpu_device *gpu = to_nv_gpu(device); + const struct falcon *falcon = gpu->falcon; + int ret; + + ret = gpu_poll_register(device, "dma_write_done", + falcon->base_page + NV_FALCON_DMA_CMD_OFFSET, + NV_FALCON_DMA_CMD_DONE_BIT, NV_FALCON_DMA_CMD_DONE_BIT, + 1000); + if (ret) + return ret; + + return 0; +} + +const struct vfio_pci_driver_ops nv_falcon_ops = { + .name = "nv_falcon", + .probe = nv_gpu_probe, + .init = nv_gpu_init, + .remove = nv_gpu_remove, + .memcpy_start 
= nv_gpu_memcpy_start, + .memcpy_wait = nv_gpu_memcpy_wait, +}; diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 9f47bceed16f..ae690a248a1f 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -14,6 +14,8 @@ LIBVFIO_C += drivers/ioat/ioat.c LIBVFIO_C += drivers/dsa/dsa.c endif +LIBVFIO_C += drivers/nv_falcons/nv_falcons.c + LIBVFIO_OUTPUT := $(OUTPUT)/libvfio LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C)) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index 6827f4a6febe..5c377cfd7deb 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -7,11 +7,14 @@ extern struct vfio_pci_driver_ops dsa_ops; extern struct vfio_pci_driver_ops ioat_ops; #endif +extern struct vfio_pci_driver_ops nv_falcon_ops; + static struct vfio_pci_driver_ops *driver_ops[] = { #ifdef __x86_64__ &dsa_ops, &ioat_ops, #endif + &nv_falcon_ops, }; void vfio_pci_driver_probe(struct vfio_pci_device *device) -- 2.43.0

