Re: [PR] [QDP] Extend CUDA tensor direct GPU encode to angle [mahout]

via GitHub Mon, 26 Jan 2026 14:30:48 -0800


Copilot commented on code in PR #932:
URL: https://github.com/apache/mahout/pull/932#discussion_r2729509174



##########
qdp/qdp-core/src/lib.rs:
##########
@@ -339,82 +339,134 @@ impl QdpEngine {
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeFromGpuPtr");
 
-        if encoding_method != "amplitude" {
-            return Err(MahoutError::NotImplemented(format!(
-                "GPU pointer encoding currently only supports 'amplitude' 
method, got '{}'",
-                encoding_method
-            )));
-        }
-
         if input_len == 0 {
             return Err(MahoutError::InvalidInput(
                 "Input data cannot be empty".into(),
             ));
         }
 
         let state_len = 1usize << num_qubits;
-        if input_len > state_len {
-            return Err(MahoutError::InvalidInput(format!(
-                "Input size {} exceeds state vector size {} (2^{} qubits)",
-                input_len, state_len, num_qubits
-            )));
-        }
-
-        // Allocate output state vector
-        let state_vector = {
-            crate::profile_scope!("GPU::Alloc");
-            gpu::GpuStateVector::new(&self.device, num_qubits)?
-        };
-
-        // Compute inverse L2 norm on GPU
-        let inv_norm = {
-            crate::profile_scope!("GPU::NormFromPtr");
-            // SAFETY: input_d validity is guaranteed by the caller's safety 
contract
-            unsafe {
-                gpu::AmplitudeEncoder::calculate_inv_norm_gpu(&self.device, 
input_d, input_len)?
+        let method = encoding_method.to_ascii_lowercase();
+
+        match method.as_str() {
+            "amplitude" => {
+                if input_len > state_len {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Input size {} exceeds state vector size {} (2^{} 
qubits)",
+                        input_len, state_len, num_qubits
+                    )));
+                }
+
+                let state_vector = {
+                    crate::profile_scope!("GPU::Alloc");
+                    gpu::GpuStateVector::new(&self.device, num_qubits)?
+                };
+
+                let inv_norm = {
+                    crate::profile_scope!("GPU::NormFromPtr");
+                    // SAFETY: input_d validity is guaranteed by the caller's 
safety contract
+                    unsafe {
+                        gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
+                            &self.device,
+                            input_d,
+                            input_len,
+                        )?
+                    }
+                };
+
+                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "State vector precision mismatch (expected float64 
buffer)".to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_amplitude_encode(
+                            input_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            input_len,
+                            state_len,
+                            inv_norm,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Amplitude encode kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device.synchronize().map_err(|e| {
+                        MahoutError::Cuda(format!("CUDA device synchronize 
failed: {:?}", e))
+                    })?;
+                }
+
+                let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(state_vector.to_dlpack())
             }
-        };
-
-        // Get output pointer
-        let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
-            MahoutError::InvalidInput(
-                "State vector precision mismatch (expected float64 
buffer)".to_string(),
-            )
-        })?;
-
-        // Launch encoding kernel
-        {
-            crate::profile_scope!("GPU::KernelLaunch");
-            let ret = unsafe {
-                qdp_kernels::launch_amplitude_encode(
-                    input_d,
-                    state_ptr as *mut std::ffi::c_void,
-                    input_len,
-                    state_len,
-                    inv_norm,
-                    std::ptr::null_mut(), // default stream
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Amplitude encode kernel failed with CUDA error code: {} 
({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
+            "angle" => {
+                if input_len != num_qubits {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Angle encoding expects {} values (one per qubit), got 
{}",
+                        num_qubits, input_len
+                    )));
+                }
+
+                let state_vector = {
+                    crate::profile_scope!("GPU::Alloc");
+                    gpu::GpuStateVector::new(&self.device, num_qubits)?
+                };
+
+                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "State vector precision mismatch (expected float64 
buffer)".to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_angle_encode(
+                            input_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            state_len,
+                            num_qubits as u32,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Angle encoding kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device.synchronize().map_err(|e| {
+                        MahoutError::Cuda(format!("CUDA device synchronize 
failed: {:?}", e))
+                    })?;
+                }
+
+                let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(state_vector.to_dlpack())
             }

Review Comment:
   The new angle encoding GPU pointer paths don't validate that input angles 
are finite, unlike the regular angle encoding paths which check for finite 
values (see gpu/encodings/angle.rs lines 215-221 and encoding/angle.rs lines 
95-104). The amplitude GPU encoding path validates the norm after computation 
which indirectly catches non-finite values, but angle encoding has no 
equivalent validation. This means non-finite values (NaN, Inf) could silently 
propagate through sin/cos operations into the quantum state. Consider either: 
(1) adding a post-kernel validation step to check the output state for 
non-finite values, similar to how amplitude encoding validates the norm, or (2) 
explicitly documenting in the safety contract that callers must ensure input 
angles are finite for the GPU pointer API.



##########
qdp/qdp-core/src/lib.rs:
##########
@@ -469,106 +514,169 @@ impl QdpEngine {
         }
 
         let state_len = 1usize << num_qubits;
-        if sample_size > state_len {
-            return Err(MahoutError::InvalidInput(format!(
-                "Sample size {} exceeds state vector size {} (2^{} qubits)",
-                sample_size, state_len, num_qubits
-            )));
-        }
-
-        // Allocate output state vector
-        let batch_state_vector = {
-            crate::profile_scope!("GPU::AllocBatch");
-            gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
-        };
-
-        // Compute inverse norms on GPU using warp-reduced kernel
-        let inv_norms_gpu = {
-            crate::profile_scope!("GPU::BatchNormKernel");
-            use cudarc::driver::DevicePtrMut;
-
-            let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
-                MahoutError::MemoryAllocation(format!("Failed to allocate norm 
buffer: {:?}", e))
-            })?;
-
-            let ret = unsafe {
-                qdp_kernels::launch_l2_norm_batch(
-                    input_batch_d,
-                    num_samples,
-                    sample_size,
-                    *buffer.device_ptr_mut() as *mut f64,
-                    std::ptr::null_mut(), // default stream
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Norm reduction kernel failed with CUDA error code: {} 
({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
-            }
-
-            buffer
-        };
-
-        // Validate norms on host to catch zero or NaN samples early
-        {
-            crate::profile_scope!("GPU::NormValidation");
-            let host_inv_norms = self
-                .device
-                .dtoh_sync_copy(&inv_norms_gpu)
-                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms 
to host: {:?}", e)))?;
-
-            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
-                return Err(MahoutError::InvalidInput(
-                    "One or more samples have zero or invalid 
norm".to_string(),
-                ));
+        let method = encoding_method.to_ascii_lowercase();
+
+        match method.as_str() {
+            "amplitude" => {
+                if sample_size > state_len {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Sample size {} exceeds state vector size {} (2^{} 
qubits)",
+                        sample_size, state_len, num_qubits
+                    )));
+                }
+
+                let batch_state_vector = {
+                    crate::profile_scope!("GPU::AllocBatch");
+                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
+                };
+
+                let inv_norms_gpu = {
+                    crate::profile_scope!("GPU::BatchNormKernel");
+                    use cudarc::driver::DevicePtrMut;
+
+                    let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+                        MahoutError::MemoryAllocation(format!(
+                            "Failed to allocate norm buffer: {:?}",
+                            e
+                        ))
+                    })?;
+
+                    let ret = unsafe {
+                        qdp_kernels::launch_l2_norm_batch(
+                            input_batch_d,
+                            num_samples,
+                            sample_size,
+                            *buffer.device_ptr_mut() as *mut f64,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Norm reduction kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+
+                    buffer
+                };
+
+                {
+                    crate::profile_scope!("GPU::NormValidation");
+                    let host_inv_norms =
+                        self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| 
{
+                            MahoutError::Cuda(format!("Failed to copy norms to 
host: {:?}", e))
+                        })?;
+
+                    if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 
0.0) {
+                        return Err(MahoutError::InvalidInput(
+                            "One or more samples have zero or invalid 
norm".to_string(),
+                        ));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::BatchKernelLaunch");
+                    use cudarc::driver::DevicePtr;
+
+                    let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| 
{
+                        MahoutError::InvalidInput(
+                            "Batch state vector precision mismatch (expected 
float64 buffer)"
+                                .to_string(),
+                        )
+                    })?;
+
+                    let ret = unsafe {
+                        qdp_kernels::launch_amplitude_encode_batch(
+                            input_batch_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            *inv_norms_gpu.device_ptr() as *const f64,
+                            num_samples,
+                            sample_size,
+                            state_len,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Batch kernel launch failed with CUDA error code: 
{} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device
+                        .synchronize()
+                        .map_err(|e| MahoutError::Cuda(format!("Sync failed: 
{:?}", e)))?;
+                }
+
+                let batch_state_vector =
+                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(batch_state_vector.to_dlpack())
             }
-        }
-
-        // Launch batch kernel
-        {
-            crate::profile_scope!("GPU::BatchKernelLaunch");
-            use cudarc::driver::DevicePtr;
-
-            let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
-                MahoutError::InvalidInput(
-                    "Batch state vector precision mismatch (expected float64 
buffer)".to_string(),
-                )
-            })?;
-
-            let ret = unsafe {
-                qdp_kernels::launch_amplitude_encode_batch(
-                    input_batch_d,
-                    state_ptr as *mut std::ffi::c_void,
-                    *inv_norms_gpu.device_ptr() as *const f64,
-                    num_samples,
-                    sample_size,
-                    state_len,
-                    std::ptr::null_mut(), // default stream
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Batch kernel launch failed with CUDA error code: {} ({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
+            "angle" => {
+                if sample_size != num_qubits {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Angle encoding expects sample_size={} (one angle per 
qubit), got {}",
+                        num_qubits, sample_size
+                    )));
+                }
+
+                let batch_state_vector = {
+                    crate::profile_scope!("GPU::AllocBatch");
+                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
+                };
+
+                let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "Batch state vector precision mismatch (expected 
float64 buffer)"
+                            .to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::BatchKernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_angle_encode_batch(
+                            input_batch_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            num_samples,
+                            state_len,
+                            num_qubits as u32,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Batch angle encoding kernel failed: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    self.device
+                        .synchronize()
+                        .map_err(|e| MahoutError::Cuda(format!("Sync failed: 
{:?}", e)))?;
+                }
+
+                let batch_state_vector =
+                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(batch_state_vector.to_dlpack())
             }

Review Comment:
   The batch angle encoding GPU pointer path doesn't validate that input angles 
are finite, unlike the regular batch angle encoding path which checks for 
finite values (see encoding/angle.rs lines 95-104). This could allow non-finite 
values (NaN, Inf) to silently propagate into the quantum state. Consider adding 
validation consistent with the single-sample GPU pointer path, or documenting 
this as a caller responsibility in the safety contract.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [QDP] Extend CUDA tensor direct GPU encode to angle [mahout]

Reply via email to