(mahout) 34/50: [QDP] Add launch_amplitude_encode_f32 function (#739)

guanmingchiu Tue, 06 Jan 2026 08:45:36 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


commit c79f06a3b0270ae47a4d9b68fd111e79831f2d6e
Author: Ping <[email protected]>
AuthorDate: Thu Dec 25 19:51:10 2025 +0800

    [QDP] Add launch_amplitude_encode_f32 function (#739)
    
    * float32 follow up
    
    Signed-off-by: 400Ping <[email protected]>
    
    * update
    
    Signed-off-by: 400Ping <[email protected]>
    
    ---------
    
    Signed-off-by: 400Ping <[email protected]>
---
 qdp/qdp-kernels/src/amplitude.cu          | 67 ++++++++++++++++++++++++++++---
 qdp/qdp-kernels/src/lib.rs                | 27 +++++++++++++
 qdp/qdp-kernels/tests/amplitude_encode.rs | 59 ++++++++++++++++++++++++++-
 3 files changed, 147 insertions(+), 6 deletions(-)

diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index ea5fc27f7..7cf94ce92 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -41,18 +41,17 @@ __global__ void amplitude_encode_kernel(
 
     // Vectorized Load Optimization:
     // If we are well within bounds, treat input as double2 to issue a single 
128-bit load instruction.
-    // This reduces memory transactions and improves throughput on RTX cards.
+    // Use __ldg() to pull through the read-only cache; cudaMalloc aligns to 
256 bytes so the
+    // reinterpret_cast<double2*> load is naturally aligned.
     if (state_idx_base + 1 < input_len) {
         // Reinterpret cast to load two doubles at once
-        // Note: Assumes input is reasonably aligned (standard cudaMalloc 
provides 256-byte alignment)
-        const double2* input_vec = reinterpret_cast<const double2*>(input);
-        double2 loaded = input_vec[idx];
+        const double2 loaded = __ldg(reinterpret_cast<const double2*>(input) + 
idx);
         v1 = loaded.x;
         v2 = loaded.y;
     }
     // Handle edge case: Odd input length
     else if (state_idx_base < input_len) {
-        v1 = input[state_idx_base];
+        v1 = __ldg(input + state_idx_base);
         // v2 remains 0.0
     }
 
@@ -66,6 +65,35 @@ __global__ void amplitude_encode_kernel(
     }
 }
 
+__global__ void amplitude_encode_kernel_f32(
+    const float* __restrict__ input,
+    cuComplex* __restrict__ state,
+    size_t input_len,
+    size_t state_len,
+    float inv_norm
+) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t state_idx_base = idx * 2;
+    if (state_idx_base >= state_len) return;
+
+    float v1 = 0.0f;
+    float v2 = 0.0f;
+
+    if (state_idx_base + 1 < input_len) {
+        // Mirror the double kernel: cached vectorized load for two floats
+        const float2 loaded = __ldg(reinterpret_cast<const float2*>(input) + 
idx);
+        v1 = loaded.x;
+        v2 = loaded.y;
+    } else if (state_idx_base < input_len) {
+        v1 = __ldg(input + state_idx_base);
+    }
+
+    state[state_idx_base] = make_cuComplex(v1 * inv_norm, 0.0f);
+    if (state_idx_base + 1 < state_len) {
+        state[state_idx_base + 1] = make_cuComplex(v2 * inv_norm, 0.0f);
+    }
+}
+
 // Warp-level reduction for sum using shuffle instructions
 __device__ __forceinline__ double warp_reduce_sum(double val) {
     for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
@@ -137,6 +165,35 @@ int launch_amplitude_encode(
     return (int)cudaGetLastError();
 }
 
+/// Launch amplitude encoding kernel for float32
+int launch_amplitude_encode_f32(
+    const float* input_d,
+    void* state_d,
+    size_t input_len,
+    size_t state_len,
+    float inv_norm,
+    cudaStream_t stream
+) {
+    if (inv_norm <= 0.0f || !isfinite(inv_norm)) {
+        return cudaErrorInvalidValue;
+    }
+
+    cuComplex* state_complex_d = static_cast<cuComplex*>(state_d);
+
+    const int blockSize = 256;
+    const int gridSize = (state_len / 2 + blockSize - 1) / blockSize;
+
+    amplitude_encode_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+        input_d,
+        state_complex_d,
+        input_len,
+        state_len,
+        inv_norm
+    );
+
+    return (int)cudaGetLastError();
+}
+
 /// Optimized batch amplitude encoding kernel
 ///
 /// Memory Layout (row-major):
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index bae8782ef..d9fc0a163 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -69,6 +69,20 @@ unsafe extern "C" {
         stream: *mut c_void,
     ) -> i32;
 
+    /// Launch amplitude encoding kernel (float32 input/output)
+    /// Returns CUDA error code (0 = success)
+    ///
+    /// # Safety
+    /// Requires valid GPU pointers, must sync before freeing
+    pub fn launch_amplitude_encode_f32(
+        input_d: *const f32,
+        state_d: *mut c_void,
+        input_len: usize,
+        state_len: usize,
+        inv_norm: f32,
+        stream: *mut c_void,
+    ) -> i32;
+
     /// Launch batch amplitude encoding kernel
     /// Returns CUDA error code (0 = success)
     ///
@@ -138,6 +152,19 @@ pub extern "C" fn launch_amplitude_encode(
     999 // Error: CUDA unavailable
 }
 
+#[cfg(not(target_os = "linux"))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_amplitude_encode_f32(
+    _input_d: *const f32,
+    _state_d: *mut c_void,
+    _input_len: usize,
+    _state_len: usize,
+    _inv_norm: f32,
+    _stream: *mut c_void,
+) -> i32 {
+    999
+}
+
 #[cfg(not(target_os = "linux"))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_l2_norm(
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs 
b/qdp/qdp-kernels/tests/amplitude_encode.rs
index e290d550c..4223dd0bb 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -19,9 +19,17 @@
 #[cfg(target_os = "linux")]
 use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
 #[cfg(target_os = "linux")]
-use qdp_kernels::{CuDoubleComplex, launch_amplitude_encode, launch_l2_norm, 
launch_l2_norm_batch};
+use qdp_kernels::{
+    CuComplex,
+    CuDoubleComplex,
+    launch_amplitude_encode,
+    launch_amplitude_encode_f32,
+    launch_l2_norm,
+    launch_l2_norm_batch,
+};
 
 const EPSILON: f64 = 1e-10;
+const EPSILON_F32: f32 = 1e-5;
 
 #[test]
 #[cfg(target_os = "linux")]
@@ -94,6 +102,55 @@ fn test_amplitude_encode_basic() {
     println!("PASS: Basic amplitude encoding works correctly");
 }
 
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_basic_f32() {
+    println!("Testing basic amplitude encoding (float32)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input: Vec<f32> = vec![3.0, 4.0];
+    let norm = (input[0] * input[0] + input[1] * input[1]).sqrt();
+    let inv_norm = 1.0f32 / norm;
+    let state_len = 4usize;
+
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            inv_norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    assert!((state_h[0].x - 0.6).abs() < EPSILON_F32, "First element should be 
0.6");
+    assert!(state_h[0].y.abs() < EPSILON_F32, "First element imaginary should 
be 0");
+    assert!((state_h[1].x - 0.8).abs() < EPSILON_F32, "Second element should 
be 0.8");
+    assert!(state_h[1].y.abs() < EPSILON_F32, "Second element imaginary should 
be 0");
+    assert!(state_h[2].x.abs() < EPSILON_F32, "Third element should be 0");
+    assert!(state_h[3].x.abs() < EPSILON_F32, "Fourth element should be 0");
+
+    let total_prob: f32 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+    assert!((total_prob - 1.0).abs() < EPSILON_F32, "Total probability should 
be 1.0");
+
+    println!("PASS: Basic float32 amplitude encoding works correctly");
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_amplitude_encode_power_of_two() {

(mahout) 34/50: [QDP] Add launch_amplitude_encode_f32 function (#739)

Reply via email to