(mahout) branch main updated: [Feature] Make QDP CUDA kernel build targets configurable and future-compatible (#1283)

hcr Mon, 11 May 2026 21:40:26 -0700

This is an automated email from the ASF dual-hosted git repository.

ryankert01 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new 3aff27946 [Feature] Make QDP CUDA kernel build targets configurable and future-compatible (#1283)
3aff27946 is described below

commit 3aff2794635a2d1ba109b9b3c2ca05ece3d1ba26
Author: Vic Wen <[email protected]>
AuthorDate: Tue May 12 12:40:13 2026 +0800

    [Feature] Make QDP CUDA kernel build targets configurable and future-compatible (#1283)
    
    * Make QDP CUDA kernel build targets configurable
    
    * docs(qdp): document CUDA architecture target override
    
    * Revert "docs(qdp): document CUDA architecture target override"
    
    This reverts commit 9062a4bcf5fa38698bb15756d02a9346cb353f77.
    
    * docs(qdp): add supported GPU backends section
---
 docs/qdp/getting-started.md |  17 +++++
 qdp/qdp-kernels/build.rs    | 155 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 155 insertions(+), 17 deletions(-)

diff --git a/docs/qdp/getting-started.md b/docs/qdp/getting-started.md
index 4abf17226..3579875f6 100644
--- a/docs/qdp/getting-started.md
+++ b/docs/qdp/getting-started.md
@@ -13,6 +13,23 @@ QDP (Quantum Data Plane) is a GPU-accelerated library for encoding classical dat
   - NVIDIA GPU with a CUDA-compatible PyTorch build (verify with `python -c "import torch; print(torch.cuda.is_available())"`)
   - AMD GPU with a ROCm-compatible PyTorch build (verify with `python -c "import torch; print(torch.version.hip)"`) plus Triton with HIP support
 
+## Supported GPU Backends
+
+QDP currently supports the following GPU backends:
+
+- NVIDIA CUDA backend
+  - CUDA builds target NVIDIA GPUs supported by the installed CUDA toolkit.
+  - The current default CUDA architecture shortlist spans common NVIDIA
+    generations from Turing through Blackwell: `sm_75`, `sm_80`, `sm_86`,
+    `sm_89`, `sm_90`, `sm_100`, and `sm_120`.
+  - The final generated CUDA targets depend on the local `nvcc` supported
+    architectures.
+
+- AMD ROCm backend
+  - ROCm builds target AMD GPUs supported by the installed ROCm stack and the
+    Triton backend used by QDP.
+  - The final supported devices depend on the local ROCm environment.
+
 ## Installation
 
 ```bash
diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs
index 097b57e15..def59d693 100644
--- a/qdp/qdp-kernels/build.rs
+++ b/qdp/qdp-kernels/build.rs
@@ -26,6 +26,134 @@
 use std::env;
 use std::process::Command;
 
+const DEFAULT_CUBIN_ARCHES: &[&str] = &["75", "80", "86", "89", "90", "100", "120"];
+const DEFAULT_PTX_CANDIDATES: &[&str] = &["120", "100", "90", "89", "86", "80", "75"];
+const LEGACY_FALLBACK_ARCHES: &[&str] = &["75", "80", "86"];
+
+fn add_sm_target(build: &mut cc::Build, arch: &str) {
+    build.flag("-gencode");
+    build.flag(format!("arch=compute_{arch},code=sm_{arch}"));
+}
+
+fn add_ptx_target(build: &mut cc::Build, arch: &str) {
+    build.flag("-gencode");
+    build.flag(format!("arch=compute_{arch},code=compute_{arch}"));
+}
+
+fn parse_arch_name(raw: &str) -> Result<String, String> {
+    let trimmed = raw.trim();
+    if trimmed.is_empty()
+        || !trimmed
+            .chars()
+            .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
+        || !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
+    {
+        return Err(format!(
+            "Invalid CUDA architecture '{trimmed}' in QDP_CUDA_ARCH_LIST. Expected entries like \
+             '89', '90a', or '120+PTX'."
+        ));
+    }
+
+    Ok(trimmed.to_ascii_lowercase())
+}
+
+fn apply_env_arch_list(build: &mut cc::Build, raw: &str) -> Result<(), String> {
+    let mut saw_target = false;
+    for entry in raw.split(',') {
+        let trimmed = entry.trim();
+        if trimmed.is_empty() {
+            continue;
+        }
+
+        if let Some(base) = trimmed
+            .strip_suffix("+PTX")
+            .or_else(|| trimmed.strip_suffix("+ptx"))
+        {
+            let arch = parse_arch_name(base)?;
+            add_sm_target(build, &arch);
+            add_ptx_target(build, &arch);
+            saw_target = true;
+            continue;
+        }
+
+        let arch = parse_arch_name(trimmed)?;
+        add_sm_target(build, &arch);
+        saw_target = true;
+    }
+
+    if !saw_target {
+        return Err(
+            "QDP_CUDA_ARCH_LIST did not contain any usable CUDA architectures after parsing."
+                .to_string(),
+        );
+    }
+
+    Ok(())
+}
+
+fn query_nvcc_list(flag: &str) -> Vec<String> {
+    let Ok(output) = Command::new("nvcc").arg(flag).output() else {
+        return Vec::new();
+    };
+
+    if !output.status.success() {
+        return Vec::new();
+    }
+
+    String::from_utf8_lossy(&output.stdout)
+        .lines()
+        .filter_map(|line| {
+            line.trim()
+                .strip_prefix("sm_")
+                .or_else(|| line.trim().strip_prefix("compute_"))
+        })
+        .map(|suffix| suffix.to_ascii_lowercase())
+        .collect()
+}
+
+fn nvcc_supports(supported_arches: &[String], arch: &str) -> bool {
+    supported_arches.iter().any(|supported| supported == arch)
+}
+
+fn apply_default_arch_targets(build: &mut cc::Build) {
+    let supported_sm = query_nvcc_list("--list-gpu-code");
+    let supported_compute = query_nvcc_list("--list-gpu-arch");
+
+    if supported_sm.is_empty() && supported_compute.is_empty() {
+        for arch in LEGACY_FALLBACK_ARCHES {
+            add_sm_target(build, arch);
+        }
+        return;
+    }
+
+    let cubin_arches = if supported_sm.is_empty() {
+        &supported_compute
+    } else {
+        &supported_sm
+    };
+    let mut added_cubin = false;
+
+    for arch in DEFAULT_CUBIN_ARCHES {
+        if nvcc_supports(cubin_arches, arch) {
+            add_sm_target(build, arch);
+            added_cubin = true;
+        }
+    }
+
+    if !added_cubin {
+        for arch in LEGACY_FALLBACK_ARCHES {
+            add_sm_target(build, arch);
+        }
+    }
+
+    if let Some(ptx_arch) = DEFAULT_PTX_CANDIDATES
+        .iter()
+        .find(|arch| nvcc_supports(&supported_compute, arch))
+    {
+        add_ptx_target(build, ptx_arch);
+    }
+}
+
 fn main() {
     // Let rustc know about our build-script-defined cfg flags (avoids `unexpected_cfgs` warnings).
     println!("cargo::rustc-check-cfg=cfg(qdp_no_cuda)");
@@ -38,6 +166,7 @@ fn main() {
     println!("cargo:rerun-if-changed=src/iqp.cu");
     println!("cargo:rerun-if-changed=src/phase.cu");
     println!("cargo:rerun-if-env-changed=QDP_NO_CUDA");
+    println!("cargo:rerun-if-env-changed=QDP_CUDA_ARCH_LIST");
     println!("cargo:rerun-if-changed=src/kernel_config.h");
 
     // Check if CUDA is available by looking for nvcc
@@ -81,23 +210,15 @@ fn main() {
     build
         .cuda(true)
         .flag("-cudart=shared") // Use shared CUDA runtime
-        .flag("-std=c++17") // C++17 for modern CUDA features
-        // GPU architecture targets
-        // SM 75 = Turing (T4, RTX 2000 series)
-        // SM 80 = Ampere (A100, RTX 3000 series)
-        // SM 86 = Ampere (RTX 3090, A40)
-        // SM 89 = Ada Lovelace (RTX 4000 series)
-        // SM 90 = Hopper (H100)
-        // Support both Turing (sm_75) and Ampere+ architectures
-        .flag("-gencode")
-        .flag("arch=compute_75,code=sm_75")
-        .flag("-gencode")
-        .flag("arch=compute_80,code=sm_80")
-        .flag("-gencode")
-        .flag("arch=compute_86,code=sm_86")
-        // Optional: Add more architectures for production
-        // .flag("-gencode")
-        // .flag("arch=compute_89,code=sm_89")
+        .flag("-std=c++17"); // C++17 for modern CUDA features
+
+    if let Ok(raw) = env::var("QDP_CUDA_ARCH_LIST") {
+        apply_env_arch_list(&mut build, &raw).unwrap_or_else(|message| panic!("{message}"));
+    } else {
+        apply_default_arch_targets(&mut build);
+    }
+
+    build
         .file("src/amplitude.cu")
         .file("src/basis.cu")
         .file("src/angle.cu")

(mahout) branch main updated: [Feature] Make QDP CUDA kernel build targets configurable and future-compatible (#1283)

Reply via email to