(mahout) branch main updated: [QDP] Extend CUDA tensor direct GPU encode to angle (#932)

jiekaichang Thu, 29 Jan 2026 03:38:27 -0800

This is an automated email from the ASF dual-hosted git repository.

jiekaichang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new 3e401f977 [QDP] Extend CUDA tensor direct GPU encode to angle (#932)
3e401f977 is described below

commit 3e401f9778d41b86f5ce1e644b26f1e5b52f8f76
Author: Jie-Kai Chang <[email protected]>
AuthorDate: Thu Jan 29 19:38:11 2026 +0800

    [QDP] Extend CUDA tensor direct GPU encode to angle (#932)
    
    * Extend CUDA tensor direct GPU encode to angle
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix pre-commit
    
    Signed-off-by: 400Ping <[email protected]>
    
    * update
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix conflicts
    
    Signed-off-by: 400Ping <[email protected]>
    
    ---------
    
    Signed-off-by: 400Ping <[email protected]>
    Signed-off-by: 400Ping <[email protected]>
---
 qdp/qdp-core/src/lib.rs   | 507 ++++++++++++++++++++++++++++++----------------
 qdp/qdp-python/src/lib.rs |   7 +-
 2 files changed, 339 insertions(+), 175 deletions(-)

diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 2b14f6c5b..f6b58d50a 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -319,7 +319,7 @@ impl QdpEngine {
     /// * `input_d` - Device pointer to input data (f64 array on GPU)
     /// * `input_len` - Number of f64 elements in the input
     /// * `num_qubits` - Number of qubits for encoding
-    /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+    /// * `encoding_method` - Strategy (currently "amplitude" and "angle" 
supported)
     ///
     /// # Returns
     /// DLPack pointer for zero-copy PyTorch integration
@@ -339,13 +339,6 @@ impl QdpEngine {
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeFromGpuPtr");
 
-        if encoding_method != "amplitude" {
-            return Err(MahoutError::NotImplemented(format!(
-                "GPU pointer encoding currently only supports 'amplitude' 
method, got '{}'",
-                encoding_method
-            )));
-        }
-
         if input_len == 0 {
             return Err(MahoutError::InvalidInput(
                 "Input data cannot be empty".into(),
@@ -353,68 +346,127 @@ impl QdpEngine {
         }
 
         let state_len = 1usize << num_qubits;
-        if input_len > state_len {
-            return Err(MahoutError::InvalidInput(format!(
-                "Input size {} exceeds state vector size {} (2^{} qubits)",
-                input_len, state_len, num_qubits
-            )));
-        }
-
-        // Allocate output state vector
-        let state_vector = {
-            crate::profile_scope!("GPU::Alloc");
-            gpu::GpuStateVector::new(&self.device, num_qubits)?
-        };
-
-        // Compute inverse L2 norm on GPU
-        let inv_norm = {
-            crate::profile_scope!("GPU::NormFromPtr");
-            // SAFETY: input_d validity is guaranteed by the caller's safety 
contract
-            unsafe {
-                gpu::AmplitudeEncoder::calculate_inv_norm_gpu(&self.device, 
input_d, input_len)?
+        let method = encoding_method.to_ascii_lowercase();
+
+        match method.as_str() {
+            "amplitude" => {
+                if input_len > state_len {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Input size {} exceeds state vector size {} (2^{} 
qubits)",
+                        input_len, state_len, num_qubits
+                    )));
+                }
+
+                let state_vector = {
+                    crate::profile_scope!("GPU::Alloc");
+                    gpu::GpuStateVector::new(&self.device, num_qubits)?
+                };
+
+                let inv_norm = {
+                    crate::profile_scope!("GPU::NormFromPtr");
+                    // SAFETY: input_d validity is guaranteed by the caller's 
safety contract
+                    unsafe {
+                        gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
+                            &self.device,
+                            input_d,
+                            input_len,
+                        )?
+                    }
+                };
+
+                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "State vector precision mismatch (expected float64 
buffer)".to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_amplitude_encode(
+                            input_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            input_len,
+                            state_len,
+                            inv_norm,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Amplitude encode kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device.synchronize().map_err(|e| {
+                        MahoutError::Cuda(format!("CUDA device synchronize 
failed: {:?}", e))
+                    })?;
+                }
+
+                let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(state_vector.to_dlpack())
             }
-        };
-
-        // Get output pointer
-        let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
-            MahoutError::InvalidInput(
-                "State vector precision mismatch (expected float64 
buffer)".to_string(),
-            )
-        })?;
-
-        // Launch encoding kernel
-        {
-            crate::profile_scope!("GPU::KernelLaunch");
-            let ret = unsafe {
-                qdp_kernels::launch_amplitude_encode(
-                    input_d,
-                    state_ptr as *mut std::ffi::c_void,
-                    input_len,
-                    state_len,
-                    inv_norm,
-                    std::ptr::null_mut(), // default stream
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Amplitude encode kernel failed with CUDA error code: {} 
({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
+            "angle" => {
+                if input_len != num_qubits {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Angle encoding expects {} values (one per qubit), got 
{}",
+                        num_qubits, input_len
+                    )));
+                }
+
+                let state_vector = {
+                    crate::profile_scope!("GPU::Alloc");
+                    gpu::GpuStateVector::new(&self.device, num_qubits)?
+                };
+
+                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "State vector precision mismatch (expected float64 
buffer)".to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_angle_encode(
+                            input_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            state_len,
+                            num_qubits as u32,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Angle encoding kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device.synchronize().map_err(|e| {
+                        MahoutError::Cuda(format!("CUDA device synchronize 
failed: {:?}", e))
+                    })?;
+                }
+
+                let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(state_vector.to_dlpack())
             }
+            _ => Err(MahoutError::NotImplemented(format!(
+                "GPU pointer encoding currently only supports 'amplitude' and 
'angle' methods, got '{}'",
+                encoding_method
+            ))),
         }
-
-        // Synchronize
-        {
-            crate::profile_scope!("GPU::Synchronize");
-            self.device.synchronize().map_err(|e| {
-                MahoutError::Cuda(format!("CUDA device synchronize failed: 
{:?}", e))
-            })?;
-        }
-
-        let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
-        Ok(state_vector.to_dlpack())
     }
 
     /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
@@ -428,7 +480,7 @@ impl QdpEngine {
     /// * `num_samples` - Number of samples in the batch
     /// * `sample_size` - Size of each sample in f64 elements
     /// * `num_qubits` - Number of qubits for encoding
-    /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+    /// * `encoding_method` - Strategy (currently "amplitude" and "angle" 
supported)
     ///
     /// # Returns
     /// Single DLPack pointer containing all encoded states (shape: 
[num_samples, 2^num_qubits])
@@ -449,13 +501,6 @@ impl QdpEngine {
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
 
-        if encoding_method != "amplitude" {
-            return Err(MahoutError::NotImplemented(format!(
-                "GPU pointer batch encoding currently only supports 
'amplitude' method, got '{}'",
-                encoding_method
-            )));
-        }
-
         if num_samples == 0 {
             return Err(MahoutError::InvalidInput(
                 "Number of samples cannot be zero".into(),
@@ -469,106 +514,224 @@ impl QdpEngine {
         }
 
         let state_len = 1usize << num_qubits;
-        if sample_size > state_len {
-            return Err(MahoutError::InvalidInput(format!(
-                "Sample size {} exceeds state vector size {} (2^{} qubits)",
-                sample_size, state_len, num_qubits
-            )));
-        }
-
-        // Allocate output state vector
-        let batch_state_vector = {
-            crate::profile_scope!("GPU::AllocBatch");
-            gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
-        };
-
-        // Compute inverse norms on GPU using warp-reduced kernel
-        let inv_norms_gpu = {
-            crate::profile_scope!("GPU::BatchNormKernel");
-            use cudarc::driver::DevicePtrMut;
-
-            let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
-                MahoutError::MemoryAllocation(format!("Failed to allocate norm 
buffer: {:?}", e))
-            })?;
-
-            let ret = unsafe {
-                qdp_kernels::launch_l2_norm_batch(
-                    input_batch_d,
-                    num_samples,
-                    sample_size,
-                    *buffer.device_ptr_mut() as *mut f64,
-                    std::ptr::null_mut(), // default stream
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Norm reduction kernel failed with CUDA error code: {} 
({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
-            }
-
-            buffer
-        };
-
-        // Validate norms on host to catch zero or NaN samples early
-        {
-            crate::profile_scope!("GPU::NormValidation");
-            let host_inv_norms = self
-                .device
-                .dtoh_sync_copy(&inv_norms_gpu)
-                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms 
to host: {:?}", e)))?;
-
-            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
-                return Err(MahoutError::InvalidInput(
-                    "One or more samples have zero or invalid 
norm".to_string(),
-                ));
+        let method = encoding_method.to_ascii_lowercase();
+
+        match method.as_str() {
+            "amplitude" => {
+                if sample_size > state_len {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Sample size {} exceeds state vector size {} (2^{} 
qubits)",
+                        sample_size, state_len, num_qubits
+                    )));
+                }
+
+                let batch_state_vector = {
+                    crate::profile_scope!("GPU::AllocBatch");
+                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
+                };
+
+                let inv_norms_gpu = {
+                    crate::profile_scope!("GPU::BatchNormKernel");
+                    use cudarc::driver::DevicePtrMut;
+
+                    let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+                        MahoutError::MemoryAllocation(format!(
+                            "Failed to allocate norm buffer: {:?}",
+                            e
+                        ))
+                    })?;
+
+                    let ret = unsafe {
+                        qdp_kernels::launch_l2_norm_batch(
+                            input_batch_d,
+                            num_samples,
+                            sample_size,
+                            *buffer.device_ptr_mut() as *mut f64,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Norm reduction kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+
+                    buffer
+                };
+
+                {
+                    crate::profile_scope!("GPU::NormValidation");
+                    let host_inv_norms =
+                        self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| 
{
+                            MahoutError::Cuda(format!("Failed to copy norms to 
host: {:?}", e))
+                        })?;
+
+                    if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 
0.0) {
+                        return Err(MahoutError::InvalidInput(
+                            "One or more samples have zero or invalid 
norm".to_string(),
+                        ));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::BatchKernelLaunch");
+                    use cudarc::driver::DevicePtr;
+
+                    let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| 
{
+                        MahoutError::InvalidInput(
+                            "Batch state vector precision mismatch (expected 
float64 buffer)"
+                                .to_string(),
+                        )
+                    })?;
+
+                    let ret = unsafe {
+                        qdp_kernels::launch_amplitude_encode_batch(
+                            input_batch_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            *inv_norms_gpu.device_ptr() as *const f64,
+                            num_samples,
+                            sample_size,
+                            state_len,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Batch kernel launch failed with CUDA error code: 
{} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device
+                        .synchronize()
+                        .map_err(|e| MahoutError::Cuda(format!("Sync failed: 
{:?}", e)))?;
+                }
+
+                let batch_state_vector =
+                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(batch_state_vector.to_dlpack())
             }
-        }
-
-        // Launch batch kernel
-        {
-            crate::profile_scope!("GPU::BatchKernelLaunch");
-            use cudarc::driver::DevicePtr;
-
-            let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
-                MahoutError::InvalidInput(
-                    "Batch state vector precision mismatch (expected float64 
buffer)".to_string(),
-                )
-            })?;
-
-            let ret = unsafe {
-                qdp_kernels::launch_amplitude_encode_batch(
-                    input_batch_d,
-                    state_ptr as *mut std::ffi::c_void,
-                    *inv_norms_gpu.device_ptr() as *const f64,
-                    num_samples,
-                    sample_size,
-                    state_len,
-                    std::ptr::null_mut(), // default stream
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Batch kernel launch failed with CUDA error code: {} ({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
+            "angle" => {
+                use cudarc::driver::DevicePtrMut;
+
+                if sample_size != num_qubits {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Angle encoding expects sample_size={} (one angle per 
qubit), got {}",
+                        num_qubits, sample_size
+                    )));
+                }
+
+                // Validate that all input angles are finite (no NaN/Inf), 
consistent with
+                // CPU and host-side batch angle encoding paths.
+                let angle_validation_buffer = {
+                    crate::profile_scope!("GPU::AngleFiniteCheckBatch");
+
+                    let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+                        MahoutError::MemoryAllocation(format!(
+                            "Failed to allocate angle validation buffer: {:?}",
+                            e
+                        ))
+                    })?;
+
+                    let ret = unsafe {
+                        qdp_kernels::launch_l2_norm_batch(
+                            input_batch_d,
+                            num_samples,
+                            sample_size,
+                            *buffer.device_ptr_mut() as *mut f64,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Angle validation norm kernel failed with CUDA 
error code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+
+                    buffer
+                };
+
+                {
+                    
crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
+                    let host_norms = self
+                        .device
+                        .dtoh_sync_copy(&angle_validation_buffer)
+                        .map_err(|e| {
+                            MahoutError::Cuda(format!(
+                                "Failed to copy angle validation norms to 
host: {:?}",
+                                e
+                            ))
+                        })?;
+
+                    if host_norms.iter().any(|v| !v.is_finite()) {
+                        return Err(MahoutError::InvalidInput(
+                            "Angle encoding batch contains non-finite values 
(NaN or Inf)"
+                                .to_string(),
+                        ));
+                    }
+                }
+
+                let batch_state_vector = {
+                    crate::profile_scope!("GPU::AllocBatch");
+                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
+                };
+
+                let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "Batch state vector precision mismatch (expected 
float64 buffer)"
+                            .to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::BatchKernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_angle_encode_batch(
+                            input_batch_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            num_samples,
+                            state_len,
+                            num_qubits as u32,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Batch angle encoding kernel failed: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device
+                        .synchronize()
+                        .map_err(|e| MahoutError::Cuda(format!("Sync failed: 
{:?}", e)))?;
+                }
+
+                let batch_state_vector =
+                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(batch_state_vector.to_dlpack())
             }
+            _ => Err(MahoutError::NotImplemented(format!(
+                "GPU pointer batch encoding currently only supports 
'amplitude' and 'angle' methods, got '{}'",
+                encoding_method
+            ))),
         }
-
-        // Synchronize
-        {
-            crate::profile_scope!("GPU::Synchronize");
-            self.device
-                .synchronize()
-                .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", 
e)))?;
-        }
-
-        let batch_state_vector = batch_state_vector.to_precision(&self.device, 
self.precision)?;
-        Ok(batch_state_vector.to_dlpack())
     }
 }
 
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index 0df6cedc6..8647d98b7 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -218,10 +218,11 @@ fn validate_cuda_tensor_for_encoding(
     expected_device_id: usize,
     encoding_method: &str,
 ) -> PyResult<()> {
-    // Check encoding method support (currently only amplitude is supported 
for CUDA tensors)
-    if encoding_method != "amplitude" {
+    let method = encoding_method.to_ascii_lowercase();
+    // Check encoding method support (currently amplitude and angle are 
supported for CUDA tensors)
+    if method != "amplitude" && method != "angle" {
         return Err(PyRuntimeError::new_err(format!(
-            "CUDA tensor encoding currently only supports 'amplitude' method, 
got '{}'. \
+            "CUDA tensor encoding currently only supports 'amplitude' and 
'angle' methods, got '{}'. \
              Use tensor.cpu() to convert to CPU tensor for other encoding 
methods.",
             encoding_method
         )));

(mahout) branch main updated: [QDP] Extend CUDA tensor direct GPU encode to angle (#932)

Reply via email to