(mahout) branch main updated: [QDP] Delegate GPU pointer encoding to QuantumEncoder trait (#1006)

guanmingchiu Fri, 06 Feb 2026 09:23:07 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new 5371667d0 [QDP] Delegate GPU pointer encoding to QuantumEncoder trait (#1006)
5371667d0 is described below

commit 5371667d05aea52f3ce146d379f6c7cceb9fc230
Author: Vic Wen <[email protected]>
AuthorDate: Sat Feb 7 01:21:59 2026 +0800

    [QDP] Delegate GPU pointer encoding to QuantumEncoder trait (#1006)
    
    * refactor: delegate GPU pointer encoding to QuantumEncoder trait
    
    * test: error handling for unsupported GPU encoding methods
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs | 158 +++++++++
 qdp/qdp-core/src/gpu/encodings/angle.rs     | 154 +++++++++
 qdp/qdp-core/src/gpu/encodings/basis.rs     | 107 ++++++
 qdp/qdp-core/src/gpu/encodings/mod.rs       |  44 +++
 qdp/qdp-core/src/lib.rs                     | 492 ++--------------------------
 qdp/qdp-core/tests/gpu_ptr_encoding.rs      |  10 +-
 testing/qdp/test_bindings.py                |   5 +-
 7 files changed, 493 insertions(+), 477 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 85259e18a..fc57d189e 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -299,6 +299,164 @@ impl QuantumEncoder for AmplitudeEncoder {
         Ok(batch_state_vector)
     }
 
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_d: *const c_void,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        let state_len = 1 << num_qubits;
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+        if input_len > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Input size {} exceeds state vector size {} (2^{} qubits)",
+                input_len, state_len, num_qubits
+            )));
+        }
+        let input_d = input_d as *const f64;
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            GpuStateVector::new(device, num_qubits, Precision::Float64)?
+        };
+        let inv_norm = {
+            crate::profile_scope!("GPU::NormFromPtr");
+            unsafe { Self::calculate_inv_norm_gpu_with_stream(device, input_d, input_len, stream)? }
+        };
+        let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float64 buffer)".to_string(),
+            )
+        })?;
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                launch_amplitude_encode(
+                    input_d,
+                    state_ptr as *mut c_void,
+                    input_len,
+                    state_len,
+                    inv_norm,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Amplitude encode kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(state_vector)
+    }
+
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_batch_from_gpu_ptr(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_batch_d: *const c_void,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        let state_len = 1 << num_qubits;
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+        if sample_size > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Sample size {} exceeds state vector size {} (2^{} qubits)",
+                sample_size, state_len, num_qubits
+            )));
+        }
+        let input_batch_d = input_batch_d as *const f64;
+        let batch_state_vector = {
+            crate::profile_scope!("GPU::AllocBatch");
+            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+        };
+        let inv_norms_gpu = {
+            crate::profile_scope!("GPU::BatchNormKernel");
+            use cudarc::driver::DevicePtrMut;
+            let mut buffer = device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+                MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
+            })?;
+            let ret = unsafe {
+                launch_l2_norm_batch(
+                    input_batch_d,
+                    num_samples,
+                    sample_size,
+                    *buffer.device_ptr_mut() as *mut f64,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Norm reduction kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+            buffer
+        };
+        {
+            crate::profile_scope!("GPU::NormValidation");
+            let host_inv_norms = device
+                .dtoh_sync_copy(&inv_norms_gpu)
+                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
+            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+                return Err(MahoutError::InvalidInput(
+                    "One or more samples have zero or invalid norm".to_string(),
+                ));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunch");
+            use cudarc::driver::DevicePtr;
+            let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+                MahoutError::InvalidInput(
+                    "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
+                )
+            })?;
+            let ret = unsafe {
+                launch_amplitude_encode_batch(
+                    input_batch_d,
+                    state_ptr as *mut c_void,
+                    *inv_norms_gpu.device_ptr() as *const f64,
+                    num_samples,
+                    sample_size,
+                    state_len,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch kernel launch failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(batch_state_vector)
+    }
+
     fn name(&self) -> &'static str {
         "amplitude"
     }
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index d1a1091d5..1c3e5b8f5 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -217,6 +217,160 @@ impl QuantumEncoder for AngleEncoder {
         Ok(batch_state_vector)
     }
 
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_d: *const c_void,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if input_len != num_qubits {
+            return Err(MahoutError::InvalidInput(format!(
+                "Angle encoding expects {} values (one per qubit), got {}",
+                num_qubits, input_len
+            )));
+        }
+        let state_len = 1 << num_qubits;
+        let angles_d = input_d as *const f64;
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            GpuStateVector::new(device, num_qubits, Precision::Float64)?
+        };
+        let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float64 buffer)".to_string(),
+            )
+        })?;
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_angle_encode(
+                    angles_d,
+                    state_ptr as *mut c_void,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Angle encoding kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(state_vector)
+    }
+
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_batch_from_gpu_ptr(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_batch_d: *const c_void,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+        if sample_size != num_qubits {
+            return Err(MahoutError::InvalidInput(format!(
+                "Angle encoding expects sample_size={} (one angle per qubit), got {}",
+                num_qubits, sample_size
+            )));
+        }
+        let state_len = 1 << num_qubits;
+        let input_batch_d = input_batch_d as *const f64;
+        let angle_validation_buffer = {
+            crate::profile_scope!("GPU::AngleFiniteCheckBatch");
+            use cudarc::driver::DevicePtrMut;
+            let mut buffer = device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+                MahoutError::MemoryAllocation(format!(
+                    "Failed to allocate angle validation buffer: {:?}",
+                    e
+                ))
+            })?;
+            let ret = unsafe {
+                qdp_kernels::launch_l2_norm_batch(
+                    input_batch_d,
+                    num_samples,
+                    sample_size,
+                    *buffer.device_ptr_mut() as *mut f64,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Angle validation norm kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+            buffer
+        };
+        {
+            crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
+            let host_norms = device
+                .dtoh_sync_copy(&angle_validation_buffer)
+                .map_err(|e| {
+                    MahoutError::Cuda(format!(
+                        "Failed to copy angle validation norms to host: {:?}",
+                        e
+                    ))
+                })?;
+            if host_norms.iter().any(|v| !v.is_finite()) {
+                return Err(MahoutError::InvalidInput(
+                    "Angle encoding batch contains non-finite values (NaN or Inf)".to_string(),
+                ));
+            }
+        }
+        let batch_state_vector = {
+            crate::profile_scope!("GPU::AllocBatch");
+            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+        };
+        let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
+            )
+        })?;
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_angle_encode_batch(
+                    input_batch_d,
+                    state_ptr as *mut c_void,
+                    num_samples,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch angle encoding kernel failed: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(batch_state_vector)
+    }
+
     fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
         validate_qubit_count(num_qubits)?;
         if data.len() != num_qubits {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 33e8f14bf..569e1455e 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -225,6 +225,113 @@ impl QuantumEncoder for BasisEncoder {
         Ok(batch_state_vector)
     }
 
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_d: *const c_void,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if input_len != 1 {
+            return Err(MahoutError::InvalidInput(format!(
+                "Basis encoding expects exactly 1 value (the basis index), got {}",
+                input_len
+            )));
+        }
+        let state_len = 1 << num_qubits;
+        let basis_indices_d = input_d as *const usize;
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            GpuStateVector::new(device, num_qubits, Precision::Float64)?
+        };
+        let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float64 buffer)".to_string(),
+            )
+        })?;
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_basis_encode_batch(
+                    basis_indices_d,
+                    state_ptr as *mut c_void,
+                    1,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Basis encoding kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(state_vector)
+    }
+
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_batch_from_gpu_ptr(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_batch_d: *const c_void,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if sample_size != 1 {
+            return Err(MahoutError::InvalidInput(format!(
+                "Basis encoding expects sample_size=1 (one index per sample), got {}",
+                sample_size
+            )));
+        }
+        let state_len = 1 << num_qubits;
+        let basis_indices_d = input_batch_d as *const usize;
+        let batch_state_vector = {
+            crate::profile_scope!("GPU::AllocBatch");
+            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+        };
+        let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
+            )
+        })?;
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_basis_encode_batch(
+                    basis_indices_d,
+                    state_ptr as *mut c_void,
+                    num_samples,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch basis encoding kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(batch_state_vector)
+    }
+
     fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
         // Basic validation: qubits and data availability
         validate_qubit_count(num_qubits)?;
diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs b/qdp/qdp-core/src/gpu/encodings/mod.rs
index ad1a9577a..319c8068a 100644
--- a/qdp/qdp-core/src/gpu/encodings/mod.rs
+++ b/qdp/qdp-core/src/gpu/encodings/mod.rs
@@ -22,6 +22,8 @@ use crate::error::{MahoutError, Result};
 use crate::gpu::memory::GpuStateVector;
 use crate::preprocessing::Preprocessor;
 use cudarc::driver::CudaDevice;
+#[cfg(target_os = "linux")]
+use std::ffi::c_void;
 
 /// Maximum number of qubits supported (16GB GPU memory limit)
 /// This constant must match MAX_QUBITS in qdp-kernels/src/kernel_config.h
@@ -90,6 +92,48 @@ pub trait QuantumEncoder: Send + Sync {
 
     /// Strategy description
     fn description(&self) -> &'static str;
+
+    /// Encode from existing GPU pointer (zero-copy). Default: not supported.
+    ///
+    /// # Safety
+    /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len` elements
+    /// of the expected dtype on the same device as `device`, and `stream` is a valid CUDA stream or null.
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr(
+        &self,
+        _device: &Arc<CudaDevice>,
+        _input_d: *const c_void,
+        _input_len: usize,
+        _num_qubits: usize,
+        _stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        Err(MahoutError::NotImplemented(format!(
+            "encode_from_gpu_ptr not supported for {}",
+            self.name()
+        )))
+    }
+
+    /// Encode batch from existing GPU pointer (zero-copy). Default: not supported.
+    ///
+    /// # Safety
+    /// Caller must ensure `input_batch_d` points to valid GPU memory with at least
+    /// `num_samples * sample_size` elements of the expected dtype on the same device as `device`,
+    /// and `stream` is a valid CUDA stream or null.
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_batch_from_gpu_ptr(
+        &self,
+        _device: &Arc<CudaDevice>,
+        _input_batch_d: *const c_void,
+        _num_samples: usize,
+        _sample_size: usize,
+        _num_qubits: usize,
+        _stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        Err(MahoutError::NotImplemented(format!(
+            "encode_batch_from_gpu_ptr not supported for {}",
+            self.name()
+        )))
+    }
 }
 
 // Encoding implementations
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index bf813c470..65cea496e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -413,10 +413,6 @@ impl QdpEngine {
     /// a raw GPU pointer directly, avoiding the GPU→CPU→GPU copy that would otherwise
     /// be required.
     ///
-    /// TODO: Refactor to use QuantumEncoder trait (add `encode_from_gpu_ptr` to trait)
-    /// to reduce duplication with AmplitudeEncoder::encode(). This would also make it
-    /// easier to add GPU pointer support for other encoders (angle, basis) in the future.
-    ///
     /// # Arguments
     /// * `input_d` - Device pointer to input data (f64 for amplitude/angle, usize/int64 for basis)
     /// * `input_len` - Number of elements in the input
@@ -468,7 +464,6 @@ impl QdpEngine {
         stream: *mut std::ffi::c_void,
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeFromGpuPtr");
-
         if input_len == 0 {
             return Err(MahoutError::InvalidInput(
                 "Input data cannot be empty".into(),
@@ -477,186 +472,12 @@ impl QdpEngine {
 
         validate_cuda_input_ptr(&self.device, input_d)?;
 
-        let state_len = 1usize << num_qubits;
-        let method = encoding_method.to_ascii_lowercase();
-
-        match method.as_str() {
-            "amplitude" => {
-                if input_len == 0 {
-                    return Err(MahoutError::InvalidInput(
-                        "Input data cannot be empty".into(),
-                    ));
-                }
-
-                if input_len > state_len {
-                    return Err(MahoutError::InvalidInput(format!(
-                        "Input size {} exceeds state vector size {} (2^{} qubits)",
-                        input_len, state_len, num_qubits
-                    )));
-                }
-
-                let input_d = input_d as *const f64;
-
-                let state_vector = {
-                    crate::profile_scope!("GPU::Alloc");
-                    gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float64)?
-                };
-
-                let inv_norm = {
-                    crate::profile_scope!("GPU::NormFromPtr");
-                    // SAFETY: input_d validity is guaranteed by the caller's safety contract
-                    unsafe {
-                        gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
-                            &self.device,
-                            input_d,
-                            input_len,
-                        )?
-                    }
-                };
-
-                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
-                    MahoutError::InvalidInput(
-                        "State vector precision mismatch (expected float64 buffer)".to_string(),
-                    )
-                })?;
-
-                {
-                    crate::profile_scope!("GPU::KernelLaunch");
-                    let ret = unsafe {
-                        qdp_kernels::launch_amplitude_encode(
-                            input_d,
-                            state_ptr as *mut std::ffi::c_void,
-                            input_len,
-                            state_len,
-                            inv_norm,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Amplitude encode kernel failed with CUDA error code: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::Synchronize");
-                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
-                }
-
-                let state_vector = state_vector.to_precision(&self.device, self.precision)?;
-                Ok(state_vector.to_dlpack())
-            }
-            "angle" => {
-                if input_len != num_qubits {
-                    return Err(MahoutError::InvalidInput(format!(
-                        "Angle encoding expects {} values (one per qubit), got {}",
-                        num_qubits, input_len
-                    )));
-                }
-
-                let angles_d = input_d as *const f64;
-
-                let state_vector = {
-                    crate::profile_scope!("GPU::Alloc");
-                    gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float64)?
-                };
-
-                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
-                    MahoutError::InvalidInput(
-                        "State vector precision mismatch (expected float64 buffer)".to_string(),
-                    )
-                })?;
-
-                {
-                    crate::profile_scope!("GPU::KernelLaunch");
-                    let ret = unsafe {
-                        qdp_kernels::launch_angle_encode(
-                            angles_d,
-                            state_ptr as *mut std::ffi::c_void,
-                            state_len,
-                            num_qubits as u32,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Angle encoding kernel failed with CUDA error code: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::Synchronize");
-                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
-                }
-
-                let state_vector = state_vector.to_precision(&self.device, self.precision)?;
-                Ok(state_vector.to_dlpack())
-            }
-            "basis" => {
-                if input_len != 1 {
-                    return Err(MahoutError::InvalidInput(format!(
-                        "Basis encoding expects exactly 1 value (the basis index), got {}",
-                        input_len
-                    )));
-                }
-
-                let basis_indices_d = input_d as *const usize;
-
-                let state_vector = {
-                    crate::profile_scope!("GPU::Alloc");
-                    gpu::GpuStateVector::new(&self.device, num_qubits, self.precision)?
-                };
-
-                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
-                    MahoutError::InvalidInput(
-                        "State vector precision mismatch (expected float64 buffer)".to_string(),
-                    )
-                })?;
-
-                // Use batch API with num_samples=1 to avoid D2H copy; launch_basis_encode takes host usize.
-                {
-                    crate::profile_scope!("GPU::KernelLaunch");
-                    let ret = unsafe {
-                        qdp_kernels::launch_basis_encode_batch(
-                            basis_indices_d,
-                            state_ptr as *mut std::ffi::c_void,
-                            1,
-                            state_len,
-                            num_qubits as u32,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Basis encoding kernel failed with CUDA error code: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::Synchronize");
-                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
-                }
-
-                let state_vector = state_vector.to_precision(&self.device, self.precision)?;
-                Ok(state_vector.to_dlpack())
-            }
-            _ => Err(MahoutError::NotImplemented(format!(
-                "GPU pointer encoding currently only supports 'amplitude', 'angle', or 'basis' methods, got '{}'",
-                encoding_method
-            ))),
-        }
+        let encoder = get_encoder(encoding_method)?;
+        let state_vector = unsafe {
+            encoder.encode_from_gpu_ptr(&self.device, input_d, input_len, num_qubits, stream)
+        }?;
+        let state_vector = state_vector.to_precision(&self.device, self.precision)?;
+        Ok(state_vector.to_dlpack())
     }
 
     /// Encode from existing GPU pointer (float32 input, amplitude encoding only)
@@ -787,8 +608,6 @@ impl QdpEngine {
     ///
     /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
     ///
-    /// TODO: Refactor to use QuantumEncoder trait (see `encode_from_gpu_ptr` TODO).
-    ///
     /// # Arguments
     /// * `input_batch_d` - Device pointer to batch input data (f64 for amplitude/angle, usize/int64 for basis)
     /// * `num_samples` - Number of samples in the batch
@@ -844,10 +663,6 @@ impl QdpEngine {
         stream: *mut std::ffi::c_void,
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
-
-        let state_len = 1usize << num_qubits;
-        let method = encoding_method.to_ascii_lowercase();
-
         if num_samples == 0 {
             return Err(MahoutError::InvalidInput(
                 "Number of samples cannot be zero".into(),
@@ -862,288 +677,19 @@ impl QdpEngine {
 
         validate_cuda_input_ptr(&self.device, input_batch_d)?;
 
-        match method.as_str() {
-            "amplitude" => {
-                if sample_size == 0 {
-                    return Err(MahoutError::InvalidInput(
-                        "Sample size cannot be zero".into(),
-                    ));
-                }
-
-                if sample_size > state_len {
-                    return Err(MahoutError::InvalidInput(format!(
-                        "Sample size {} exceeds state vector size {} (2^{} 
qubits)",
-                        sample_size, state_len, num_qubits
-                    )));
-                }
-
-                let input_batch_d = input_batch_d as *const f64;
-
-                let batch_state_vector = {
-                    crate::profile_scope!("GPU::AllocBatch");
-                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
-                };
-
-                let inv_norms_gpu = {
-                    crate::profile_scope!("GPU::BatchNormKernel");
-                    use cudarc::driver::DevicePtrMut;
-
-                    let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
-                        MahoutError::MemoryAllocation(format!(
-                            "Failed to allocate norm buffer: {:?}",
-                            e
-                        ))
-                    })?;
-
-                    let ret = unsafe {
-                        qdp_kernels::launch_l2_norm_batch(
-                            input_batch_d,
-                            num_samples,
-                            sample_size,
-                            *buffer.device_ptr_mut() as *mut f64,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Norm reduction kernel failed with CUDA error 
code: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-
-                    buffer
-                };
-
-                // Validate norms on host to catch zero or NaN samples early
-                {
-                    crate::profile_scope!("GPU::NormValidation");
-                    let host_inv_norms =
-                        self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| 
{
-                            MahoutError::Cuda(format!("Failed to copy norms to 
host: {:?}", e))
-                        })?;
-
-                    if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 
0.0) {
-                        return Err(MahoutError::InvalidInput(
-                            "One or more samples have zero or invalid 
norm".to_string(),
-                        ));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::BatchKernelLaunch");
-                    use cudarc::driver::DevicePtr;
-
-                    let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| 
{
-                        MahoutError::InvalidInput(
-                            "Batch state vector precision mismatch (expected 
float64 buffer)"
-                                .to_string(),
-                        )
-                    })?;
-
-                    let ret = unsafe {
-                        qdp_kernels::launch_amplitude_encode_batch(
-                            input_batch_d,
-                            state_ptr as *mut std::ffi::c_void,
-                            *inv_norms_gpu.device_ptr() as *const f64,
-                            num_samples,
-                            sample_size,
-                            state_len,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Batch kernel launch failed with CUDA error code: 
{} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::Synchronize");
-                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
-                }
-
-                let batch_state_vector =
-                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
-                Ok(batch_state_vector.to_dlpack())
-            }
-            "angle" => {
-                use cudarc::driver::DevicePtrMut;
-
-                if sample_size == 0 {
-                    return Err(MahoutError::InvalidInput(
-                        "Sample size cannot be zero".into(),
-                    ));
-                }
-
-                if sample_size != num_qubits {
-                    return Err(MahoutError::InvalidInput(format!(
-                        "Angle encoding expects sample_size={} (one angle per 
qubit), got {}",
-                        num_qubits, sample_size
-                    )));
-                }
-
-                let input_batch_d = input_batch_d as *const f64;
-
-                // Validate that all input angles are finite (no NaN/Inf), 
consistent with
-                // CPU and host-side batch angle encoding paths.
-                let angle_validation_buffer = {
-                    crate::profile_scope!("GPU::AngleFiniteCheckBatch");
-
-                    let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
-                        MahoutError::MemoryAllocation(format!(
-                            "Failed to allocate angle validation buffer: {:?}",
-                            e
-                        ))
-                    })?;
-
-                    let ret = unsafe {
-                        qdp_kernels::launch_l2_norm_batch(
-                            input_batch_d,
-                            num_samples,
-                            sample_size,
-                            *buffer.device_ptr_mut() as *mut f64,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Angle validation norm kernel failed with CUDA 
error code: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-
-                    buffer
-                };
-
-                {
-                    
crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
-                    let host_norms = self
-                        .device
-                        .dtoh_sync_copy(&angle_validation_buffer)
-                        .map_err(|e| {
-                            MahoutError::Cuda(format!(
-                                "Failed to copy angle validation norms to 
host: {:?}",
-                                e
-                            ))
-                        })?;
-
-                    if host_norms.iter().any(|v| !v.is_finite()) {
-                        return Err(MahoutError::InvalidInput(
-                            "Angle encoding batch contains non-finite values 
(NaN or Inf)"
-                                .to_string(),
-                        ));
-                    }
-                }
-
-                let batch_state_vector = {
-                    crate::profile_scope!("GPU::AllocBatch");
-                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
-                };
-
-                let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
-                    MahoutError::InvalidInput(
-                        "Batch state vector precision mismatch (expected 
float64 buffer)"
-                            .to_string(),
-                    )
-                })?;
-
-                {
-                    crate::profile_scope!("GPU::BatchKernelLaunch");
-                    let ret = unsafe {
-                        qdp_kernels::launch_angle_encode_batch(
-                            input_batch_d,
-                            state_ptr as *mut std::ffi::c_void,
-                            num_samples,
-                            state_len,
-                            num_qubits as u32,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Batch angle encoding kernel failed: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::Synchronize");
-                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
-                }
-
-                let batch_state_vector =
-                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
-                Ok(batch_state_vector.to_dlpack())
-            }
-            "basis" => {
-                if sample_size != 1 {
-                    return Err(MahoutError::InvalidInput(format!(
-                        "Basis encoding expects sample_size=1 (one index per 
sample), got {}",
-                        sample_size
-                    )));
-                }
-
-                let basis_indices_d = input_batch_d as *const usize;
-
-                let batch_state_vector = {
-                    crate::profile_scope!("GPU::AllocBatch");
-                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
-                };
-
-                let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
-                    MahoutError::InvalidInput(
-                        "Batch state vector precision mismatch (expected 
float64 buffer)"
-                            .to_string(),
-                    )
-                })?;
-
-                {
-                    crate::profile_scope!("GPU::BatchKernelLaunch");
-                    let ret = unsafe {
-                        qdp_kernels::launch_basis_encode_batch(
-                            basis_indices_d,
-                            state_ptr as *mut std::ffi::c_void,
-                            num_samples,
-                            state_len,
-                            num_qubits as u32,
-                            stream,
-                        )
-                    };
-
-                    if ret != 0 {
-                        return Err(MahoutError::KernelLaunch(format!(
-                            "Batch basis encoding kernel failed with CUDA 
error code: {} ({})",
-                            ret,
-                            cuda_error_to_string(ret)
-                        )));
-                    }
-                }
-
-                {
-                    crate::profile_scope!("GPU::Synchronize");
-                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
-                }
-
-                let batch_state_vector =
-                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
-                Ok(batch_state_vector.to_dlpack())
-            }
-            _ => Err(MahoutError::NotImplemented(format!(
-                "GPU pointer batch encoding currently only supports 
'amplitude', 'angle', or 'basis' methods, got '{}'",
-                encoding_method
-            ))),
-        }
+        let encoder = get_encoder(encoding_method)?;
+        let batch_state_vector = unsafe {
+            encoder.encode_batch_from_gpu_ptr(
+                &self.device,
+                input_batch_d,
+                num_samples,
+                sample_size,
+                num_qubits,
+                stream,
+            )
+        }?;
+        let batch_state_vector = batch_state_vector.to_precision(&self.device, self.precision)?;
+        Ok(batch_state_vector.to_dlpack())
     }
 }
 
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index e9f23da34..c672d6956 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -80,7 +80,10 @@ fn test_encode_from_gpu_ptr_unknown_method() {
         Err(MahoutError::NotImplemented(msg)) => {
             assert!(msg.contains("unknown_encoding") || msg.contains("only supports"));
         }
-        _ => panic!("expected NotImplemented, got {:?}", result),
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(msg.contains("Unknown encoder") || msg.contains("unknown_encoding"));
+        }
+        _ => panic!("expected NotImplemented or InvalidInput, got {:?}", result),
     }
 }
 
@@ -159,7 +162,10 @@ fn test_encode_batch_from_gpu_ptr_unknown_method() {
         Err(MahoutError::NotImplemented(msg)) => {
             assert!(msg.contains("unknown_method") || msg.contains("only supports"));
         }
-        _ => panic!("expected NotImplemented, got {:?}", result),
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(msg.contains("Unknown encoder") || msg.contains("unknown_method"));
+        }
+        _ => panic!("expected NotImplemented or InvalidInput, got {:?}", result),
     }
 }
 
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index fe6b07368..007def16e 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -424,7 +424,7 @@ def test_encode_cuda_tensor_preserves_input(data_shape, is_batch):
 @pytest.mark.gpu
 @pytest.mark.parametrize("encoding_method", ["iqp"])
 def test_encode_cuda_tensor_unsupported_encoding(encoding_method):
-    """Test error when using CUDA tensor with unsupported encoding (CUDA 
supports amplitude, angle, and basis only)."""
+    """Test error when using CUDA tensor with an encoding not supported on GPU 
(only amplitude, angle, basis)."""
     pytest.importorskip("torch")
     from _qdp import QdpEngine
 
@@ -433,11 +433,12 @@ def test_encode_cuda_tensor_unsupported_encoding(encoding_method):
 
     engine = QdpEngine(0)
 
+    # CUDA path only supports amplitude, angle, basis; iqp/iqp-z should raise unsupported error
     data = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float64, device="cuda:0")
 
     with pytest.raises(
         RuntimeError,
-        match="only supports 'amplitude', 'angle', or 'basis' methods.*Use 
tensor.cpu\\(\\)",
+        match="only supports .*amplitude.*angle.*basis.*Use tensor.cpu",
     ):
         engine.encode(data, 2, encoding_method)

(mahout) branch main updated: [QDP] Delegate GPU pointer encoding to QuantumEncoder trait (#1006)

Reply via email to