This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new d052d4e02 MAHOUT-878 Add CUDA Torch Tensor Support for QDP Python Binding (#881)
d052d4e02 is described below
commit d052d4e02160c28fce4a82e2a53a3deebb66bfd5
Author: Ryan Huang <[email protected]>
AuthorDate: Thu Jan 22 14:52:33 2026 +0800
MAHOUT-878 Add CUDA Torch Tensor Support for QDP Python Binding (#881)
* Add CUDA Tensor Support for QDP Python Binding
* Refactor CUDA tensor handling to extract information directly from
PyTorch tensors
* Enhance error reporting for CUDA kernel launches by including error
descriptions
* linter
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 31 ++-
qdp/qdp-core/src/lib.rs | 268 +++++++++++++++++++++++-
qdp/qdp-python/src/lib.rs | 174 +++++++++++++++-
testing/qdp/test_bindings.py | 312 +++++++++++++++++++++++++++-
4 files changed, 772 insertions(+), 13 deletions(-)
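In practice, the new path looks like the following minimal sketch (engine constructor and `_qdp` module name are taken from the tests in this patch; treat it as illustrative, not normative):

    import torch
    from _qdp import QdpEngine

    engine = QdpEngine(0)  # engine bound to cuda:0

    # 1D float64 CUDA tensor -> single-sample amplitude encoding (zero-copy)
    x = torch.rand(4, dtype=torch.float64, device="cuda:0")
    state = torch.from_dlpack(engine.encode(x, 2, "amplitude"))

    # 2D float64 CUDA tensor -> batch encoding, one normalized state per row
    batch = torch.rand(3, 4, dtype=torch.float64, device="cuda:0")
    states = torch.from_dlpack(engine.encode(batch, 2, "amplitude"))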
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index f4b8abd75..0720cd619 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -95,11 +95,15 @@ impl QuantumEncoder for AmplitudeEncoder {
// GPU-accelerated norm for medium+ inputs, CPU fallback for tiny payloads
let inv_norm = if host_data.len() >= GPU_NORM_THRESHOLD {
- Self::calculate_inv_norm_gpu(
- _device,
- *input_slice.device_ptr() as *const f64,
- host_data.len(),
- )?
+ // SAFETY: input_slice was just allocated and copied from host_data,
+ // so the pointer is valid and contains host_data.len() elements
+ unsafe {
+ Self::calculate_inv_norm_gpu(
+ _device,
+ *input_slice.device_ptr() as *const f64,
+ host_data.len(),
+ )?
+ }
} else {
let norm = Preprocessor::calculate_l2_norm(host_data)?;
1.0 / norm
@@ -411,8 +415,20 @@ impl AmplitudeEncoder {
impl AmplitudeEncoder {
/// Compute inverse L2 norm on GPU using the reduction kernel.
+ ///
+ /// # Arguments
+ /// * `device` - CUDA device reference
+ /// * `input_ptr` - Device pointer to input data (f64 array on GPU)
+ /// * `len` - Number of f64 elements
+ ///
+ /// # Returns
+ /// The inverse L2 norm (1/||x||_2) of the input data
+ ///
+ /// # Safety
+ /// The caller must ensure `input_ptr` points to valid GPU memory containing
+ /// at least `len` f64 elements on the same device as `device`.
#[cfg(target_os = "linux")]
- fn calculate_inv_norm_gpu(
+ pub(crate) unsafe fn calculate_inv_norm_gpu(
device: &Arc<CudaDevice>,
input_ptr: *const f64,
len: usize,
@@ -447,7 +463,8 @@ impl AmplitudeEncoder {
let inv_norm = inv_norm_host.first().copied().unwrap_or(0.0);
if inv_norm == 0.0 || !inv_norm.is_finite() {
return Err(MahoutError::InvalidInput(
- "Input data has zero norm".to_string(),
+ "Input data has zero or non-finite norm (contains NaN, Inf, or
all zeros)"
+ .to_string(),
));
}
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index c5bbcf19e..f0bedb73a 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -28,7 +28,7 @@ pub mod tf_proto;
#[macro_use]
mod profiling;
-pub use error::{MahoutError, Result};
+pub use error::{MahoutError, Result, cuda_error_to_string};
pub use gpu::memory::Precision;
use std::sync::Arc;
@@ -300,6 +300,272 @@ impl QdpEngine {
encoding_method,
)
}
+
+ /// Encode from existing GPU pointer (zero-copy for CUDA tensors)
+ ///
+ /// This method enables zero-copy encoding from PyTorch CUDA tensors by accepting
+ /// a raw GPU pointer directly, avoiding the GPU→CPU→GPU copy that would otherwise
+ /// be required.
+ ///
+ /// TODO: Refactor to use QuantumEncoder trait (add `encode_from_gpu_ptr` to trait)
+ /// to reduce duplication with AmplitudeEncoder::encode(). This would also make it
+ /// easier to add GPU pointer support for other encoders (angle, basis) in the future.
+ ///
+ /// # Arguments
+ /// * `input_d` - Device pointer to input data (f64 array on GPU)
+ /// * `input_len` - Number of f64 elements in the input
+ /// * `num_qubits` - Number of qubits for encoding
+ /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+ ///
+ /// # Returns
+ /// DLPack pointer for zero-copy PyTorch integration
+ ///
+ /// # Safety
+ /// The input pointer must:
+ /// - Point to valid GPU memory on the same device as the engine
+ /// - Contain at least `input_len` f64 elements
+ /// - Remain valid for the duration of this call
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_from_gpu_ptr(
+ &self,
+ input_d: *const f64,
+ input_len: usize,
+ num_qubits: usize,
+ encoding_method: &str,
+ ) -> Result<*mut DLManagedTensor> {
+ crate::profile_scope!("Mahout::EncodeFromGpuPtr");
+
+ if encoding_method != "amplitude" {
+ return Err(MahoutError::NotImplemented(format!(
+ "GPU pointer encoding currently only supports 'amplitude'
method, got '{}'",
+ encoding_method
+ )));
+ }
+
+ if input_len == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Input data cannot be empty".into(),
+ ));
+ }
+
+ let state_len = 1usize << num_qubits;
+ if input_len > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input size {} exceeds state vector size {} (2^{} qubits)",
+ input_len, state_len, num_qubits
+ )));
+ }
+
+ // Allocate output state vector
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ gpu::GpuStateVector::new(&self.device, num_qubits)?
+ };
+
+ // Compute inverse L2 norm on GPU
+ let inv_norm = {
+ crate::profile_scope!("GPU::NormFromPtr");
+ // SAFETY: input_d validity is guaranteed by the caller's safety contract
+ unsafe {
+ gpu::AmplitudeEncoder::calculate_inv_norm_gpu(&self.device, input_d, input_len)?
+ }
+ };
+
+ // Get output pointer
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+
+ // Launch encoding kernel
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_amplitude_encode(
+ input_d,
+ state_ptr as *mut std::ffi::c_void,
+ input_len,
+ state_len,
+ inv_norm,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Amplitude encode kernel failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ // Synchronize
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ self.device.synchronize().map_err(|e| {
+ MahoutError::Cuda(format!("CUDA device synchronize failed:
{:?}", e))
+ })?;
+ }
+
+ let state_vector = state_vector.to_precision(&self.device, self.precision)?;
+ Ok(state_vector.to_dlpack())
+ }
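A sketch of the size check above as seen from Python (hypothetical values; the binding wraps MahoutError in a RuntimeError, so the exact message formatting may differ):

    x = torch.rand(8, dtype=torch.float64, device="cuda:0")
    try:
        engine.encode(x, 2, "amplitude")  # 2 qubits -> state length 4 < 8
    except RuntimeError as e:
        # e.g. "Input size 8 exceeds state vector size 4 (2^2 qubits)"
        print(e)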
+
+ /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
+ ///
+ /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
+ ///
+ /// TODO: Refactor to use QuantumEncoder trait (see `encode_from_gpu_ptr` TODO).
+ ///
+ /// # Arguments
+ /// * `input_batch_d` - Device pointer to batch input data (flattened f64 array on GPU)
+ /// * `num_samples` - Number of samples in the batch
+ /// * `sample_size` - Size of each sample in f64 elements
+ /// * `num_qubits` - Number of qubits for encoding
+ /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+ ///
+ /// # Returns
+ /// Single DLPack pointer containing all encoded states (shape: [num_samples, 2^num_qubits])
+ ///
+ /// # Safety
+ /// The input pointer must:
+ /// - Point to valid GPU memory on the same device as the engine
+ /// - Contain at least `num_samples * sample_size` f64 elements
+ /// - Remain valid for the duration of this call
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_batch_from_gpu_ptr(
+ &self,
+ input_batch_d: *const f64,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ encoding_method: &str,
+ ) -> Result<*mut DLManagedTensor> {
+ crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
+
+ if encoding_method != "amplitude" {
+ return Err(MahoutError::NotImplemented(format!(
+ "GPU pointer batch encoding currently only supports
'amplitude' method, got '{}'",
+ encoding_method
+ )));
+ }
+
+ if num_samples == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Number of samples cannot be zero".into(),
+ ));
+ }
+
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+
+ let state_len = 1usize << num_qubits;
+ if sample_size > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Sample size {} exceeds state vector size {} (2^{} qubits)",
+ sample_size, state_len, num_qubits
+ )));
+ }
+
+ // Allocate output state vector
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ gpu::GpuStateVector::new_batch(&self.device, num_samples, num_qubits)?
+ };
+
+ // Compute inverse norms on GPU using warp-reduced kernel
+ let inv_norms_gpu = {
+ crate::profile_scope!("GPU::BatchNormKernel");
+ use cudarc::driver::DevicePtrMut;
+
+ let mut buffer = self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+ MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
+ })?;
+
+ let ret = unsafe {
+ qdp_kernels::launch_l2_norm_batch(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ *buffer.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Norm reduction kernel failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+
+ buffer
+ };
+
+ // Validate norms on host to catch zero or NaN samples early
+ {
+ crate::profile_scope!("GPU::NormValidation");
+ let host_inv_norms = self
+ .device
+ .dtoh_sync_copy(&inv_norms_gpu)
+ .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms
to host: {:?}", e)))?;
+
+ if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+ return Err(MahoutError::InvalidInput(
+ "One or more samples have zero or invalid
norm".to_string(),
+ ));
+ }
+ }
+
+ // Launch batch kernel
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ use cudarc::driver::DevicePtr;
+
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+
+ let ret = unsafe {
+ qdp_kernels::launch_amplitude_encode_batch(
+ input_batch_d,
+ state_ptr as *mut std::ffi::c_void,
+ *inv_norms_gpu.device_ptr() as *const f64,
+ num_samples,
+ sample_size,
+ state_len,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch kernel launch failed with CUDA error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ // Synchronize
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ self.device
+ .synchronize()
+ .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}",
e)))?;
+ }
+
+ let batch_state_vector = batch_state_vector.to_precision(&self.device, self.precision)?;
+ Ok(batch_state_vector.to_dlpack())
+ }
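A sketch of the batch contract (shapes per the docs above; a single degenerate row fails the whole call, because the norms are validated on the host before the encode kernel runs):

    batch = torch.rand(3, 4, dtype=torch.float64, device="cuda:0")
    out = torch.from_dlpack(engine.encode(batch, 2, "amplitude"))
    assert out.shape == (3, 4)  # [num_samples, 2^num_qubits]

    bad = batch.clone()
    bad[1] = 0.0  # zero-norm sample
    try:
        engine.encode(bad, 2, "amplitude")
    except RuntimeError:
        pass  # "One or more samples have zero or invalid norm"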
}
// Re-export key types for convenience
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index fbd5c91cb..016ee1259 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -152,7 +152,7 @@ fn is_pytorch_tensor(obj: &Bound<'_, PyAny>) -> PyResult<bool> {
Ok(module_name == "torch")
}
-/// Helper to validate tensor
+/// Helper to validate CPU tensor
fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
if !is_pytorch_tensor(tensor)? {
return Err(PyRuntimeError::new_err("Object is not a PyTorch Tensor"));
@@ -171,6 +171,106 @@ fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
Ok(())
}
+/// Check if a PyTorch tensor is on a CUDA device
+fn is_cuda_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<bool> {
+ let device = tensor.getattr("device")?;
+ let device_type: String = device.getattr("type")?.extract()?;
+ Ok(device_type == "cuda")
+}
+
+/// Get the CUDA device index from a PyTorch tensor
+fn get_tensor_device_id(tensor: &Bound<'_, PyAny>) -> PyResult<i32> {
+ let device = tensor.getattr("device")?;
+ let device_index: i32 = device.getattr("index")?.extract()?;
+ Ok(device_index)
+}
+
+/// Validate a CUDA tensor for direct GPU encoding
+/// Checks: dtype=float64, contiguous, non-empty, device_id matches engine
+fn validate_cuda_tensor_for_encoding(
+ tensor: &Bound<'_, PyAny>,
+ expected_device_id: usize,
+ encoding_method: &str,
+) -> PyResult<()> {
+ // Check encoding method support (currently only amplitude is supported for CUDA tensors)
+ if encoding_method != "amplitude" {
+ return Err(PyRuntimeError::new_err(format!(
+ "CUDA tensor encoding currently only supports 'amplitude' method,
got '{}'. \
+ Use tensor.cpu() to convert to CPU tensor for other encoding
methods.",
+ encoding_method
+ )));
+ }
+
+ // Check dtype is float64
+ let dtype = tensor.getattr("dtype")?;
+ let dtype_str: String = dtype.str()?.extract()?;
+ if !dtype_str.contains("float64") {
+ return Err(PyRuntimeError::new_err(format!(
+ "CUDA tensor must have dtype float64, got {}. Use
tensor.to(torch.float64)",
+ dtype_str
+ )));
+ }
+
+ // Check contiguous
+ let is_contiguous: bool = tensor.call_method0("is_contiguous")?.extract()?;
+ if !is_contiguous {
+ return Err(PyRuntimeError::new_err(
+ "CUDA tensor must be contiguous. Use tensor.contiguous()",
+ ));
+ }
+
+ // Check non-empty
+ let numel: usize = tensor.call_method0("numel")?.extract()?;
+ if numel == 0 {
+ return Err(PyRuntimeError::new_err("CUDA tensor cannot be empty"));
+ }
+
+ // Check device matches engine
+ let tensor_device_id = get_tensor_device_id(tensor)?;
+ if tensor_device_id as usize != expected_device_id {
+ return Err(PyRuntimeError::new_err(format!(
+ "Device mismatch: tensor is on cuda:{}, but engine is on cuda:{}. \
+ Move tensor with tensor.to('cuda:{}')",
+ tensor_device_id, expected_device_id, expected_device_id
+ )));
+ }
+
+ Ok(())
+}
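The same checks, expressed as a Python-side pre-flight helper (a hypothetical convenience, not part of the binding; encode() enforces these itself and raises RuntimeError):

    def ready_for_gpu_encode(t: torch.Tensor, engine_device: int = 0) -> bool:
        # Mirrors validate_cuda_tensor_for_encoding: dtype, layout,
        # non-emptiness, and device ordinal match against the engine.
        return (
            t.is_cuda
            and t.dtype == torch.float64
            and t.is_contiguous()
            and t.numel() > 0
            and t.device.index == engine_device
        )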
+
+/// CUDA tensor information extracted directly from PyTorch tensor
+struct CudaTensorInfo {
+ data_ptr: *const f64,
+ shape: Vec<i64>,
+}
+
+/// Extract GPU pointer directly from PyTorch CUDA tensor
+///
+/// Uses PyTorch's `data_ptr()` and `shape` APIs directly instead of the DLPack protocol.
+/// This avoids the DLPack capsule lifecycle complexity and potential memory leaks
+/// from the capsule renaming pattern.
+///
+/// # Safety
+/// The returned `data_ptr` points to GPU memory owned by the source tensor.
+/// The caller must ensure the source tensor remains alive and unmodified
+/// for the entire duration that `data_ptr` is in use. Python's GIL ensures
+/// the tensor won't be garbage collected during `encode()`, but the caller
+/// must not deallocate or resize the tensor while encoding is in progress.
+fn extract_cuda_tensor_info(tensor: &Bound<'_, PyAny>) -> PyResult<CudaTensorInfo> {
+ // Get GPU pointer directly via tensor.data_ptr()
+ let data_ptr_int: isize = tensor.call_method0("data_ptr")?.extract()?;
+ if data_ptr_int == 0 {
+ return Err(PyRuntimeError::new_err("CUDA tensor has null data
pointer"));
+ }
+ let data_ptr = data_ptr_int as *const f64;
+
+ // Get shape directly via tensor.shape
+ let shape_obj = tensor.getattr("shape")?;
+ let shape: Vec<i64> = shape_obj.extract()?;
+
+ Ok(CudaTensorInfo { data_ptr, shape })
+}
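From the Python side, the extraction reads exactly the public tensor attributes (a quick sketch of what the PyO3 calls above return):

    t = torch.rand(2, 4, dtype=torch.float64, device="cuda:0")
    ptr = t.data_ptr()     # integer address of the CUDA allocation
    shape = list(t.shape)  # [2, 4]: dim 0 = samples, dim 1 = features
    assert ptr != 0        # the binding rejects a null pointer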
+
/// PyO3 wrapper for QdpEngine
///
/// Provides Python bindings for GPU-accelerated quantum state encoding.
@@ -321,6 +421,78 @@ impl QdpEngine {
// Check if it's a PyTorch tensor
if is_pytorch_tensor(data)? {
+ // Check if it's a CUDA tensor - use zero-copy GPU encoding
+ if is_cuda_tensor(data)? {
+ // Validate CUDA tensor for direct GPU encoding
+ validate_cuda_tensor_for_encoding(
+ data,
+ self.engine.device().ordinal(),
+ encoding_method,
+ )?;
+
+ // Extract GPU pointer directly from PyTorch tensor
+ let tensor_info = extract_cuda_tensor_info(data)?;
+
+ let ndim: usize = data.call_method0("dim")?.extract()?;
+
+ match ndim {
+ 1 => {
+ // 1D CUDA tensor: single sample encoding
+ let input_len = tensor_info.shape[0] as usize;
+ // SAFETY: tensor_info.data_ptr was obtained via PyTorch's data_ptr() from a
+ // valid CUDA tensor. The tensor remains alive during this call
+ // (held by Python's GIL), and we validated dtype/contiguity/device above.
+ let ptr = unsafe {
+ self.engine
+ .encode_from_gpu_ptr(
+ tensor_info.data_ptr,
+ input_len,
+ num_qubits,
+ encoding_method,
+ )
+ .map_err(|e| {
+ PyRuntimeError::new_err(format!("Encoding
failed: {}", e))
+ })?
+ };
+ return Ok(QuantumTensor {
+ ptr,
+ consumed: false,
+ });
+ }
+ 2 => {
+ // 2D CUDA tensor: batch encoding
+ let num_samples = tensor_info.shape[0] as usize;
+ let sample_size = tensor_info.shape[1] as usize;
+ // SAFETY: Same as above - pointer from validated PyTorch CUDA tensor
+ let ptr = unsafe {
+ self.engine
+ .encode_batch_from_gpu_ptr(
+ tensor_info.data_ptr,
+ num_samples,
+ sample_size,
+ num_qubits,
+ encoding_method,
+ )
+ .map_err(|e| {
+ PyRuntimeError::new_err(format!("Encoding
failed: {}", e))
+ })?
+ };
+ return Ok(QuantumTensor {
+ ptr,
+ consumed: false,
+ });
+ }
+ _ => {
+ return Err(PyRuntimeError::new_err(format!(
+ "Unsupported CUDA tensor shape: {}D. Expected 1D
tensor for single \
+ sample encoding or 2D tensor (batch_size,
features) for batch encoding.",
+ ndim
+ )));
+ }
+ }
+ }
+
+ // CPU tensor path (existing code)
validate_tensor(data)?;
// PERF: Avoid Tensor -> Python list -> Vec deep copies.
//
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index 64bf09727..590e3ec63 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -253,10 +253,314 @@ def test_encode_errors():
with pytest.raises(RuntimeError, match="Unsupported data type"):
engine.encode({"key": "value"}, 2, "amplitude")
- # Test GPU tensor input (should fail as only CPU is supported)
- gpu_tensor = torch.tensor([1.0, 2.0], device="cuda:0")
- with pytest.raises(RuntimeError, match="Only CPU tensors are currently
supported"):
- engine.encode(gpu_tensor, 1, "amplitude")
+
[email protected]
+def test_encode_cuda_tensor_1d():
+ """Test encoding from 1D CUDA tensor (single sample, zero-copy)."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create 1D CUDA tensor
+ data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, device="cuda:0")
+ qtensor = engine.encode(data, 2, "amplitude")
+
+ # Verify result
+ result = torch.from_dlpack(qtensor)
+ assert result.is_cuda
+ assert result.shape == (1, 4) # 2^2 = 4 amplitudes
+
+ # Verify normalization (amplitudes should have unit norm)
+ norm = torch.sqrt(torch.sum(torch.abs(result) ** 2))
+ assert torch.isclose(norm, torch.tensor(1.0, device="cuda:0"), atol=1e-6)
+
+
[email protected]
+def test_encode_cuda_tensor_2d_batch():
+ """Test encoding from 2D CUDA tensor (batch, zero-copy)."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create 2D CUDA tensor (batch_size=3, features=4)
+ data = torch.tensor(
+ [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]],
+ dtype=torch.float64,
+ device="cuda:0",
+ )
+ qtensor = engine.encode(data, 2, "amplitude")
+
+ # Verify result
+ result = torch.from_dlpack(qtensor)
+ assert result.is_cuda
+ assert result.shape == (3, 4) # batch_size=3, 2^2=4
+
+ # Verify each sample is normalized
+ for i in range(3):
+ norm = torch.sqrt(torch.sum(torch.abs(result[i]) ** 2))
+ assert torch.isclose(norm, torch.tensor(1.0, device="cuda:0"), atol=1e-6)
+
+
[email protected]
+def test_encode_cuda_tensor_wrong_dtype():
+ """Test error when CUDA tensor has wrong dtype (non-float64)."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create CUDA tensor with float32 dtype (wrong)
+ data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32, device="cuda:0")
+ with pytest.raises(RuntimeError, match="CUDA tensor must have dtype float64"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_non_contiguous():
+ """Test error when CUDA tensor is non-contiguous."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create non-contiguous CUDA tensor (via transpose)
+ data = torch.tensor(
+ [[1.0, 2.0], [3.0, 4.0]], dtype=torch.float64, device="cuda:0"
+ ).t()
+ assert not data.is_contiguous()
+
+ with pytest.raises(RuntimeError, match="CUDA tensor must be contiguous"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
[email protected](
+ not _has_multi_gpu(), reason="Multi-GPU setup required for this test"
+)
+def test_encode_cuda_tensor_device_mismatch():
+ """Test error when CUDA tensor is on wrong device (multi-GPU only)."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ # Engine on device 0
+ engine = QdpEngine(0)
+
+ # Tensor on device 1 (wrong device)
+ data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, device="cuda:1")
+ with pytest.raises(RuntimeError, match="Device mismatch"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_empty():
+ """Test error when CUDA tensor is empty."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create empty CUDA tensor
+ data = torch.tensor([], dtype=torch.float64, device="cuda:0")
+ with pytest.raises(RuntimeError, match="CUDA tensor cannot be empty"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_preserves_input():
+ """Test that input CUDA tensor is not modified after encoding."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create CUDA tensor and save a copy
+ original_data = [1.0, 2.0, 3.0, 4.0]
+ data = torch.tensor(original_data, dtype=torch.float64, device="cuda:0")
+ data_clone = data.clone()
+
+ # Encode
+ qtensor = engine.encode(data, 2, "amplitude")
+ _ = torch.from_dlpack(qtensor)
+
+ # Verify original tensor is unchanged
+ assert torch.equal(data, data_clone)
+
+
[email protected]
+def test_encode_cuda_tensor_unsupported_encoding():
+ """Test error when using CUDA tensor with unsupported encoding method."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # CUDA tensors currently only support amplitude encoding
+ # Use non-zero data to avoid normalization issues
+ data = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float64, device="cuda:0")
+
+ with pytest.raises(RuntimeError, match="only supports 'amplitude' method"):
+ engine.encode(data, 2, "basis")
+
+ with pytest.raises(RuntimeError, match="only supports 'amplitude' method"):
+ engine.encode(data, 2, "angle")
+
+
[email protected]
+def test_encode_cuda_tensor_3d_rejected():
+ """Test error when CUDA tensor has 3+ dimensions."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create 3D CUDA tensor (should be rejected)
+ data = torch.randn(2, 3, 4, dtype=torch.float64, device="cuda:0")
+ with pytest.raises(RuntimeError, match="Unsupported CUDA tensor shape:
3D"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_zero_values():
+ """Test error when CUDA tensor contains all zeros (zero norm)."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create CUDA tensor with all zeros (cannot be normalized)
+ data = torch.zeros(4, dtype=torch.float64, device="cuda:0")
+ with pytest.raises(RuntimeError, match="zero or non-finite norm"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_nan_values():
+ """Test error when CUDA tensor contains NaN values."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create CUDA tensor with NaN
+ data = torch.tensor(
+ [1.0, float("nan"), 3.0, 4.0], dtype=torch.float64, device="cuda:0"
+ )
+ with pytest.raises(RuntimeError, match="zero or non-finite norm"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_inf_values():
+ """Test error when CUDA tensor contains Inf values."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create CUDA tensor with Inf
+ data = torch.tensor(
+ [1.0, float("inf"), 3.0, 4.0], dtype=torch.float64, device="cuda:0"
+ )
+ with pytest.raises(RuntimeError, match="zero or non-finite norm"):
+ engine.encode(data, 2, "amplitude")
+
+
[email protected]
+def test_encode_cuda_tensor_output_dtype():
+ """Test that CUDA tensor encoding produces correct output dtype."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ # Test default precision (float32 -> complex64)
+ engine_f32 = QdpEngine(0, precision="float32")
+ data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, device="cuda:0")
+ result = torch.from_dlpack(engine_f32.encode(data, 2, "amplitude"))
+ assert result.dtype == torch.complex64, f"Expected complex64, got {result.dtype}"
+
+ # Test float64 precision (float64 -> complex128)
+ engine_f64 = QdpEngine(0, precision="float64")
+ data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, device="cuda:0")
+ result = torch.from_dlpack(engine_f64.encode(data, 2, "amplitude"))
+ assert result.dtype == torch.complex128, f"Expected complex128, got {result.dtype}"
+
+
[email protected]
+def test_encode_cuda_tensor_preserves_input_batch():
+ """Test that input 2D CUDA tensor (batch) is not modified after
encoding."""
+ pytest.importorskip("torch")
+ import torch
+ from _qdp import QdpEngine
+
+ if not torch.cuda.is_available():
+ pytest.skip("GPU required for QdpEngine")
+
+ engine = QdpEngine(0)
+
+ # Create 2D CUDA tensor and save a copy
+ data = torch.tensor(
+ [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]],
+ dtype=torch.float64,
+ device="cuda:0",
+ )
+ data_clone = data.clone()
+
+ # Encode
+ qtensor = engine.encode(data, 2, "amplitude")
+ _ = torch.from_dlpack(qtensor)
+
+ # Verify original tensor is unchanged
+ assert torch.equal(data, data_clone)
@pytest.mark.gpu