This is an automated email from the ASF dual-hosted git repository.

ryankert01 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new 51104cbce feat(qdp): hoist encode_from_gpu_ptr_f32 onto QuantumEncoder trait (#1310)
51104cbce is described below

commit 51104cbcee2d7ae22e8d287adc1deda3dc41c263
Author: KUAN-HAO HUANG <[email protected]>
AuthorDate: Mon Jun 1 16:18:29 2026 +0800

    feat(qdp): hoist encode_from_gpu_ptr_f32 onto QuantumEncoder trait (#1310)
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs | 103 ++++++++++++++++++++
 qdp/qdp-core/src/gpu/encodings/angle.rs     |  22 +++++
 qdp/qdp-core/src/gpu/encodings/basis.rs     |  23 +++++
 qdp/qdp-core/src/gpu/encodings/mod.rs       |  28 ++++++
 qdp/qdp-core/src/lib.rs                     |  74 +++-----------
 qdp/qdp-core/tests/gpu_ptr_encoding.rs      | 143 ++++++++++++++++++++++++++++
 6 files changed, 331 insertions(+), 62 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 67d9f06ac..7cf70d9ec 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -689,6 +689,27 @@ impl QuantumEncoder for AmplitudeEncoder {
         Ok(batch_state_vector)
     }
 
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr_f32(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_d: *const c_void,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        // Delegate to the workhorse `_with_stream` fn (see angle.rs for rationale).
+        unsafe {
+            Self::encode_from_gpu_ptr_f32_with_stream(
+                device,
+                input_d as *const f32,
+                input_len,
+                num_qubits,
+                stream,
+            )
+        }
+    }
+
     fn name(&self) -> &'static str {
         "amplitude"
     }
@@ -811,6 +832,88 @@ impl AmplitudeEncoder {
 }
 
 impl AmplitudeEncoder {
+    /// Encode a single sample directly from a GPU float32 pointer, returning a
+    /// `GpuStateVector` (the engine wraps it as DLPack at the public boundary).
+    ///
+    /// Symmetric with `AngleEncoder::encode_from_gpu_ptr_f32_with_stream` and
+    /// `BasisEncoder::encode_from_gpu_ptr_f32_with_stream`. The previous arrangement
+    /// (`QdpEngine::encode_from_gpu_ptr_f32_with_stream` did this inline in `lib.rs`)
+    /// made the trait surface asymmetric — only the batch variant had a real
+    /// `QuantumEncoder` override on amplitude.
+    ///
+    /// # Safety
+    /// Caller must ensure `input_d` points to at least `input_len` `f32` values in
+    /// GPU-accessible memory on the same device as `device`, and `stream` is either
+    /// null or a valid CUDA stream associated with `device`.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
+        device: &Arc<CudaDevice>,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+
+        let state_len = 1usize << num_qubits;
+        if input_len > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Input size {} exceeds state vector size {} (2^{} qubits)",
+                input_len, state_len, num_qubits
+            )));
+        }
+
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            GpuStateVector::new(device, num_qubits, Precision::Float32)?
+        };
+
+        let inv_norm = {
+            crate::profile_scope!("GPU::NormFromPtr");
+            unsafe {
+                Self::calculate_inv_norm_gpu_f32_with_stream(device, input_d, input_len, stream)?
+            }
+        };
+
+        let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float32 buffer)".to_string(),
+            )
+        })?;
+
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_amplitude_encode_f32(
+                    input_d,
+                    state_ptr as *mut c_void,
+                    input_len,
+                    state_len,
+                    inv_norm,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+
+        Ok(state_vector)
+    }
+
     /// Encode a batch directly from a GPU float32 pointer.
     ///
     /// # Safety
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 99a9d2998..36e784e03 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -595,6 +595,28 @@ impl QuantumEncoder for AngleEncoder {
         Ok(())
     }
 
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr_f32(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_d: *const c_void,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        // Delegate to the workhorse `_with_stream` fn (kept as the inherent impl so
+        // it can be called without a vtable on hot paths like `engine.rs`).
+        unsafe {
+            Self::encode_from_gpu_ptr_f32_with_stream(
+                device,
+                input_d as *const f32,
+                input_len,
+                num_qubits,
+                stream,
+            )
+        }
+    }
+
     fn name(&self) -> &'static str {
         "angle"
     }
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 1db78cdc1..4f2cec15e 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -541,6 +541,29 @@ impl QuantumEncoder for BasisEncoder {
         Ok(batch_state_vector)
     }
 
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr_f32(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_d: *const c_void,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        // Delegate to the workhorse `_with_stream` fn (see angle.rs for rationale).
+        // `input_len` is unused — basis is always one index per sample — but kept on the
+        // signature to match the trait shape used by amplitude / angle.
+        let _ = input_len;
+        unsafe {
+            Self::encode_from_gpu_ptr_f32_with_stream(
+                device,
+                input_d as *const f32,
+                num_qubits,
+                stream,
+            )
+        }
+    }
+
     fn name(&self) -> &'static str {
         "basis"
     }
diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs b/qdp/qdp-core/src/gpu/encodings/mod.rs
index 3f256e68a..8d0fd5b4c 100644
--- a/qdp/qdp-core/src/gpu/encodings/mod.rs
+++ b/qdp/qdp-core/src/gpu/encodings/mod.rs
@@ -135,6 +135,34 @@ pub trait QuantumEncoder: Send + Sync + 'static {
         )))
     }
 
+    /// Encode a single sample from an existing GPU pointer (zero-copy) using an f32 input.
+    /// Default: not supported.
+    ///
+    /// This is the f32 counterpart of [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr). The
+    /// sibling batch variant is [`encode_batch_from_gpu_ptr_f32`](Self::encode_batch_from_gpu_ptr_f32).
+    /// Adding a new encoder with f32 zero-copy support should override **both** this method and
+    /// the batch variant; the previous arrangement (single-sample as a standalone `pub unsafe
+    /// fn` on each encoder type, batch on the trait) made the pattern accidentally divergent.
+    ///
+    /// # Safety
+    /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len`
+    /// `f32` elements on the same device as `device`, and `stream` is either null or a valid
+    /// CUDA stream associated with `device`.
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_from_gpu_ptr_f32(
+        &self,
+        _device: &Arc<CudaDevice>,
+        _input_d: *const c_void,
+        _input_len: usize,
+        _num_qubits: usize,
+        _stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        Err(MahoutError::NotImplemented(format!(
+            "encode_from_gpu_ptr_f32 not supported for {}",
+            self.name()
+        )))
+    }
+
     /// Encode multiple samples in a single GPU allocation and kernel launch using f32 inputs.
     fn encode_batch_f32(
         &self,
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 5af3dd199..72d00898e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -610,72 +610,22 @@ impl QdpEngine {
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");
 
-        if input_len == 0 {
-            return Err(MahoutError::InvalidInput(
-                "Input data cannot be empty".into(),
-            ));
-        }
-
         validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
 
-        let state_len = 1usize << num_qubits;
-        if input_len > state_len {
-            return Err(MahoutError::InvalidInput(format!(
-                "Input size {} exceeds state vector size {} (2^{} qubits)",
-                input_len, state_len, num_qubits
-            )));
-        }
-
-        let state_vector = {
-            crate::profile_scope!("GPU::Alloc");
-            gpu::GpuStateVector::new(&self.device, num_qubits, Precision::Float32)?
-        };
-
-        let inv_norm = {
-            crate::profile_scope!("GPU::NormFromPtr");
-            unsafe {
-                gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
-                    &self.device,
-                    input_d,
-                    input_len,
-                    stream,
-                )?
-            }
+        // Delegate to `AmplitudeEncoder::encode_from_gpu_ptr_f32_with_stream` — the
+        // encoder-side workhorse. Keeping the kernel-launch + L2-norm sequence inside
+        // the encoder makes the trait surface symmetric (`QuantumEncoder::encode_from_gpu_ptr_f32`
+        // can override against it) and matches the angle / basis layout.
+        let state_vector = unsafe {
+            gpu::AmplitudeEncoder::encode_from_gpu_ptr_f32_with_stream(
+                &self.device,
+                input_d,
+                input_len,
+                num_qubits,
+                stream,
+            )?
         };
 
-        let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
-            MahoutError::InvalidInput(
-                "State vector precision mismatch (expected float32 buffer)".to_string(),
-            )
-        })?;
-
-        {
-            crate::profile_scope!("GPU::KernelLaunch");
-            let ret = unsafe {
-                qdp_kernels::launch_amplitude_encode_f32(
-                    input_d,
-                    state_ptr as *mut std::ffi::c_void,
-                    input_len,
-                    state_len,
-                    inv_norm,
-                    stream,
-                )
-            };
-
-            if ret != 0 {
-                return Err(MahoutError::KernelLaunch(format!(
-                    "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                )));
-            }
-        }
-
-        {
-            crate::profile_scope!("GPU::Synchronize");
-            gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
-        }
-
         let state_vector = state_vector.to_precision(&self.device, self.precision)?;
         Ok(state_vector.to_dlpack())
     }
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 68ee26bed..48ba65f84 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -1751,3 +1751,146 @@ fn test_encode_basis_from_gpu_ptr_f32_single_sample_success() {
     };
     unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 8) };
 }
+
+// ---- Trait-method tests for `QuantumEncoder::encode_from_gpu_ptr_f32` (PR 1.5) ----
+//
+// The single-sample f32 method moved onto the `QuantumEncoder` trait in PR 1.5 so
+// future encoders only need a single override point instead of a standalone inherent
+// fn that the dispatcher must remember to call. These tests dispatch through
+// `Encoding::encoder()` to exercise the trait method, not the inherent helper.
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_amplitude() {
+    let Some(_engine) = engine_f32() else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let num_qubits = 3usize;
+    let state_len = 1usize << num_qubits;
+    let data = common::create_test_data_f32(state_len);
+    let Some((device, data_d)) = common::copy_f32_to_device(data.as_slice()) else {
+        println!("SKIP: No CUDA device");
+        return;
+    };
+    let encoder = qdp_core::Encoding::Amplitude.encoder();
+    let state_vector = unsafe {
+        encoder
+            .encode_from_gpu_ptr_f32(
+                &device,
+                *data_d.device_ptr() as *const std::ffi::c_void,
+                state_len,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+            .expect("trait method should succeed for amplitude")
+    };
+    // Use the engine's own precision conversion so we get a valid dlpack to free.
+    let state_vector = state_vector
+        .to_precision(&device, qdp_core::Precision::Float32)
+        .expect("to_precision");
+    unsafe {
+        let dlpack = state_vector.to_dlpack();
+        common::take_deleter_and_delete(dlpack);
+    }
+}
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_angle() {
+    let Some(_engine) = engine_f32() else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let num_qubits = 3usize;
+    let data = common::create_test_data_f32(num_qubits);
+    let Some((device, data_d)) = common::copy_f32_to_device(data.as_slice()) else {
+        println!("SKIP: No CUDA device");
+        return;
+    };
+    let encoder = qdp_core::Encoding::Angle.encoder();
+    let state_vector = unsafe {
+        encoder
+            .encode_from_gpu_ptr_f32(
+                &device,
+                *data_d.device_ptr() as *const std::ffi::c_void,
+                num_qubits,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+            .expect("trait method should succeed for angle")
+    };
+    let state_vector = state_vector
+        .to_precision(&device, qdp_core::Precision::Float32)
+        .expect("to_precision");
+    unsafe {
+        let dlpack = state_vector.to_dlpack();
+        common::take_deleter_and_delete(dlpack);
+    }
+}
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_basis() {
+    let Some(_engine) = engine_f32() else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let num_qubits = 3usize;
+    let Some((device, data_d)) = common::copy_f32_to_device(&[5.0_f32]) else {
+        println!("SKIP: No CUDA device");
+        return;
+    };
+    let encoder = qdp_core::Encoding::Basis.encoder();
+    let state_vector = unsafe {
+        encoder
+            .encode_from_gpu_ptr_f32(
+                &device,
+                *data_d.device_ptr() as *const std::ffi::c_void,
+                1,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+            .expect("trait method should succeed for basis")
+    };
+    let state_vector = state_vector
+        .to_precision(&device, qdp_core::Precision::Float32)
+        .expect("to_precision");
+    unsafe {
+        let dlpack = state_vector.to_dlpack();
+        common::take_deleter_and_delete(dlpack);
+    }
+}
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_default_not_implemented_for_phase() {
+    // The default body returns NotImplemented for encoders that don't override.
+    // Phase / IQP / IQP-Z don't currently have an f32 zero-copy path, so the
+    // trait method must fall through to the default rather than mis-dispatch.
+    let Some(_engine) = engine_f32() else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let num_qubits = 3usize;
+    let Some((device, data_d)) = common::copy_f32_to_device(&[0.1_f32, 0.2, 0.3]) else {
+        println!("SKIP: No CUDA device");
+        return;
+    };
+    let encoder = qdp_core::Encoding::Phase.encoder();
+    let result = unsafe {
+        encoder.encode_from_gpu_ptr_f32(
+            &device,
+            *data_d.device_ptr() as *const std::ffi::c_void,
+            num_qubits,
+            num_qubits,
+            std::ptr::null_mut(),
+        )
+    };
+    match result {
+        Err(qdp_core::MahoutError::NotImplemented(msg)) => {
+            assert!(
+                msg.contains("encode_from_gpu_ptr_f32") && msg.contains("phase"),
+                "unexpected NotImplemented message: {msg}"
+            );
+        }
+        Ok(_) => panic!("phase should not support encode_from_gpu_ptr_f32"),
+        Err(e) => panic!("expected NotImplemented, got {:?}", e),
+    }
+}

Reply via email to