(mahout) branch main updated: [QDP] Extend GpuStateVector to support Float32 precision (#995)

hcr Sat, 31 Jan 2026 07:47:50 -0800

This is an automated email from the ASF dual-hosted git repository.

hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new 314260c1e [QDP] Extend GpuStateVector to support Float32 precision 
(#995)
314260c1e is described below

commit 314260c1ebbbfb2ddbd8d8cda5cfd4ca596a7036
Author: Vic Wen <[email protected]>
AuthorDate: Sat Jan 31 23:47:36 2026 +0800

    [QDP] Extend GpuStateVector to support Float32 precision (#995)
    
    * feat: extend GPU state vector allocation with precision support (f32)
    
    * feat: extend GPU state vector precision conversion to support both 
Float64 and Float32
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs |   2 +-
 qdp/qdp-core/src/gpu/encodings/angle.rs     |   2 +-
 qdp/qdp-core/src/gpu/encodings/basis.rs     |   2 +-
 qdp/qdp-core/src/gpu/encodings/iqp.rs       |   2 +-
 qdp/qdp-core/src/gpu/memory.rs              | 198 ++++++++++++++++++++++------
 qdp/qdp-core/src/lib.rs                     |   4 +-
 qdp/qdp-core/tests/dlpack.rs                |  38 +++++-
 qdp/qdp-kernels/src/amplitude.cu            |  36 +++++
 qdp/qdp-kernels/src/lib.rs                  |  23 ++++
 9 files changed, 257 insertions(+), 50 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs 
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index f7846a058..62313550d 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -70,7 +70,7 @@ impl QuantumEncoder for AmplitudeEncoder {
             // Allocate GPU state vector
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(_device, num_qubits)?
+                GpuStateVector::new(_device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
             };
 
             // Async Pipeline for large data
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs 
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 353a9f4c4..2a91cb017 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -63,7 +63,7 @@ impl QuantumEncoder for AngleEncoder {
 
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(device, num_qubits)?
+                GpuStateVector::new(device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
             };
 
             let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs 
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 0b24f9796..e883372f5 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -76,7 +76,7 @@ impl QuantumEncoder for BasisEncoder {
             // Allocate GPU state vector
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(device, num_qubits)?
+                GpuStateVector::new(device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
             };
 
             let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs 
b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 89d4a8f56..245229a40 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -87,7 +87,7 @@ impl QuantumEncoder for IqpEncoder {
 
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(device, num_qubits)?
+                GpuStateVector::new(device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
             };
 
             let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index 5ee59291a..2bb922c16 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -196,6 +196,13 @@ impl BufferStorage {
             _ => None,
         }
     }
+
+    fn ptr_f32(&self) -> Option<*mut CuComplex> {
+        match self {
+            BufferStorage::F32(buf) => Some(buf.ptr()),
+            _ => None,
+        }
+    }
 }
 
 /// Quantum state vector on GPU
@@ -220,59 +227,86 @@ unsafe impl Send for GpuStateVector {}
 unsafe impl Sync for GpuStateVector {}
 
 impl GpuStateVector {
-    /// Create GPU state vector for n qubits
-    /// Allocates 2^n complex numbers on GPU (freed on drop)
-    pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> {
+    /// Create GPU state vector for n qubits with the given precision.
+    /// Allocates 2^n complex numbers (Float32 = CuComplex, Float64 = 
CuDoubleComplex).
+    /// Default for most callers: use `Precision::Float64`.
+    #[cfg(target_os = "linux")]
+    pub fn new(_device: &Arc<CudaDevice>, qubits: usize, precision: Precision) 
-> Result<Self> {
         let _size_elements: usize = 1usize << qubits;
 
-        #[cfg(target_os = "linux")]
-        {
-            let requested_bytes = _size_elements
-                .checked_mul(std::mem::size_of::<CuDoubleComplex>())
-                .ok_or_else(|| {
-                    MahoutError::MemoryAllocation(format!(
-                        "Requested GPU allocation size overflow (elements={})",
-                        _size_elements
-                    ))
-                })?;
+        let buffer = match precision {
+            Precision::Float32 => {
+                let requested_bytes = _size_elements
+                    .checked_mul(std::mem::size_of::<CuComplex>())
+                    .ok_or_else(|| {
+                        MahoutError::MemoryAllocation(format!(
+                            "Requested GPU allocation size overflow 
(elements={})",
+                            _size_elements
+                        ))
+                    })?;
 
-            // Pre-flight check to gracefully fail before cudaMalloc when OOM 
is obvious
-            ensure_device_memory_available(
-                requested_bytes,
-                "state vector allocation",
-                Some(qubits),
-            )?;
+                ensure_device_memory_available(
+                    requested_bytes,
+                    "state vector allocation (f32)",
+                    Some(qubits),
+                )?;
 
-            // Use uninitialized allocation to avoid memory bandwidth waste.
-            // TODO: Consider using a memory pool for input buffers to avoid 
repeated
-            // cudaMalloc overhead in high-frequency encode() calls.
-            let slice =
-                unsafe { _device.alloc::<CuDoubleComplex>(_size_elements) 
}.map_err(|e| {
+                let slice = unsafe { 
_device.alloc::<CuComplex>(_size_elements) }.map_err(|e| {
                     map_allocation_error(
                         requested_bytes,
-                        "state vector allocation",
+                        "state vector allocation (f32)",
                         Some(qubits),
                         e,
                     )
                 })?;
 
-            Ok(Self {
-                buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
-                num_qubits: qubits,
-                size_elements: _size_elements,
-                num_samples: None,
-                device_id: _device.ordinal(),
-            })
-        }
+                BufferStorage::F32(GpuBufferRaw { slice })
+            }
+            Precision::Float64 => {
+                let requested_bytes = _size_elements
+                    .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+                    .ok_or_else(|| {
+                        MahoutError::MemoryAllocation(format!(
+                            "Requested GPU allocation size overflow 
(elements={})",
+                            _size_elements
+                        ))
+                    })?;
 
-        #[cfg(not(target_os = "linux"))]
-        {
-            // Non-Linux: compiles but GPU unavailable
-            Err(MahoutError::Cuda(
-                "CUDA is only available on Linux. This build does not support 
GPU operations."
-                    .to_string(),
-            ))
-        }
+                ensure_device_memory_available(
+                    requested_bytes,
+                    "state vector allocation",
+                    Some(qubits),
+                )?;
+
+                let slice =
+                    unsafe { _device.alloc::<CuDoubleComplex>(_size_elements) 
}.map_err(|e| {
+                        map_allocation_error(
+                            requested_bytes,
+                            "state vector allocation",
+                            Some(qubits),
+                            e,
+                        )
+                    })?;
+
+                BufferStorage::F64(GpuBufferRaw { slice })
+            }
+        };
+
+        Ok(Self {
+            buffer: Arc::new(buffer),
+            num_qubits: qubits,
+            size_elements: _size_elements,
+            num_samples: None,
+            device_id: _device.ordinal(),
+        })
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    pub fn new(_device: &Arc<CudaDevice>, _qubits: usize, _precision: 
Precision) -> Result<Self> {
+        Err(MahoutError::Cuda(
+            "CUDA is only available on Linux. This build does not support GPU 
operations."
+                .to_string(),
+        ))
     }
 
     /// Get current precision of the underlying buffer.
@@ -293,6 +327,11 @@ impl GpuStateVector {
         self.buffer.ptr_f64()
     }
 
+    /// Returns a single-precision pointer if the buffer stores complex64 data.
+    pub fn ptr_f32(&self) -> Option<*mut CuComplex> {
+        self.buffer.ptr_f32()
+    }
+
     /// Get the number of qubits
     pub fn num_qubits(&self) -> usize {
         self.num_qubits
@@ -362,13 +401,88 @@ impl GpuStateVector {
 
     /// Convert the state vector to the requested precision (GPU-side).
     ///
-    /// For now only down-conversion from Float64 -> Float32 is supported.
+    /// Supports Float64 -> Float32 and Float32 -> Float64.
     pub fn to_precision(&self, device: &Arc<CudaDevice>, target: Precision) -> 
Result<Self> {
         if self.precision() == target {
             return Ok(self.clone());
         }
 
         match (self.precision(), target) {
+            (Precision::Float32, Precision::Float64) => {
+                #[cfg(target_os = "linux")]
+                {
+                    let requested_bytes = self
+                        .size_elements
+                        .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+                        .ok_or_else(|| {
+                            MahoutError::MemoryAllocation(format!(
+                                "Requested GPU allocation size overflow 
(elements={})",
+                                self.size_elements
+                            ))
+                        })?;
+
+                    ensure_device_memory_available(
+                        requested_bytes,
+                        "state vector precision conversion",
+                        Some(self.num_qubits),
+                    )?;
+
+                    let slice = unsafe { 
device.alloc::<CuDoubleComplex>(self.size_elements) }
+                        .map_err(|e| {
+                            map_allocation_error(
+                                requested_bytes,
+                                "state vector precision conversion",
+                                Some(self.num_qubits),
+                                e,
+                            )
+                        })?;
+
+                    let src_ptr = self.ptr_f32().ok_or_else(|| {
+                        MahoutError::InvalidInput(
+                            "Source state vector is not Float32; cannot 
convert to Float64"
+                                .to_string(),
+                        )
+                    })?;
+
+                    let ret = unsafe {
+                        qdp_kernels::convert_state_to_double(
+                            src_ptr as *const CuComplex,
+                            *slice.device_ptr() as *mut CuDoubleComplex,
+                            self.size_elements,
+                            std::ptr::null_mut(),
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Precision conversion kernel failed: {}",
+                            ret
+                        )));
+                    }
+
+                    device.synchronize().map_err(|e| {
+                        MahoutError::Cuda(format!(
+                            "Failed to sync after precision conversion: {:?}",
+                            e
+                        ))
+                    })?;
+
+                    Ok(Self {
+                        buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { 
slice })),
+                        num_qubits: self.num_qubits,
+                        size_elements: self.size_elements,
+                        num_samples: self.num_samples,
+                        device_id: device.ordinal(),
+                    })
+                }
+
+                #[cfg(not(target_os = "linux"))]
+                {
+                    Err(MahoutError::Cuda(
+                        "Precision conversion requires CUDA 
(Linux)".to_string(),
+                    ))
+                }
+            }
             (Precision::Float64, Precision::Float32) => {
                 #[cfg(target_os = "linux")]
                 {
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 3de648fc9..a8028c62e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -447,7 +447,7 @@ impl QdpEngine {
 
                 let state_vector = {
                     crate::profile_scope!("GPU::Alloc");
-                    gpu::GpuStateVector::new(&self.device, num_qubits)?
+                    gpu::GpuStateVector::new(&self.device, num_qubits, 
Precision::Float64)?
                 };
 
                 let inv_norm = {
@@ -508,7 +508,7 @@ impl QdpEngine {
 
                 let state_vector = {
                     crate::profile_scope!("GPU::Alloc");
-                    gpu::GpuStateVector::new(&self.device, num_qubits)?
+                    gpu::GpuStateVector::new(&self.device, num_qubits, 
Precision::Float64)?
                 };
 
                 let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/tests/dlpack.rs b/qdp/qdp-core/tests/dlpack.rs
index 6b97283ce..3c039b371 100644
--- a/qdp/qdp-core/tests/dlpack.rs
+++ b/qdp/qdp-core/tests/dlpack.rs
@@ -21,6 +21,7 @@ mod dlpack_tests {
     use std::ffi::c_void;
 
     use cudarc::driver::CudaDevice;
+    use qdp_core::Precision;
     use qdp_core::dlpack::{CUDA_STREAM_LEGACY, synchronize_stream};
     use qdp_core::gpu::memory::GpuStateVector;
 
@@ -59,8 +60,8 @@ mod dlpack_tests {
         let device = CudaDevice::new(0).unwrap();
 
         let num_qubits = 2;
-        let state_vector =
-            GpuStateVector::new(&device, num_qubits).expect("Failed to create 
state vector");
+        let state_vector = GpuStateVector::new(&device, num_qubits, 
Precision::Float64)
+            .expect("Failed to create state vector");
 
         let dlpack_ptr = state_vector.to_dlpack();
         assert!(!dlpack_ptr.is_null());
@@ -86,6 +87,39 @@ mod dlpack_tests {
         }
     }
 
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn test_dlpack_single_shape_f32() {
+        let device = CudaDevice::new(0).unwrap();
+
+        let num_qubits = 2;
+        let state_vector = GpuStateVector::new(&device, num_qubits, 
Precision::Float32)
+            .expect("Failed to create Float32 state vector");
+
+        assert!(
+            state_vector.ptr_f32().is_some(),
+            "Float32 state vector should have ptr_f32()"
+        );
+        assert!(
+            state_vector.ptr_f64().is_none(),
+            "Float32 state vector should not have ptr_f64()"
+        );
+
+        let dlpack_ptr = state_vector.to_dlpack();
+        assert!(!dlpack_ptr.is_null());
+
+        unsafe {
+            let tensor = &(*dlpack_ptr).dl_tensor;
+            assert_eq!(tensor.ndim, 2, "DLPack tensor should be 2D");
+            let shape = std::slice::from_raw_parts(tensor.shape, 2);
+            assert_eq!(shape[0], 1);
+            assert_eq!(shape[1], (1 << num_qubits) as i64);
+            if let Some(deleter) = (*dlpack_ptr).deleter {
+                deleter(dlpack_ptr);
+            }
+        }
+    }
+
     /// synchronize_stream(null) is a no-op and returns Ok(()) on all 
platforms.
     #[test]
     fn test_synchronize_stream_null() {
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index aa21059e4..06676e081 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -777,6 +777,42 @@ int convert_state_to_float(
     return (int)cudaGetLastError();
 }
 
+/// Kernel: convert complex64 state vector to complex128.
+__global__ void convert_state_to_complex128_kernel(
+    const cuComplex* __restrict__ input_state,
+    cuDoubleComplex* __restrict__ output_state,
+    size_t len
+) {
+    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= len) return;
+
+    const cuComplex v = input_state[idx];
+    output_state[idx] = make_cuDoubleComplex((double)v.x, (double)v.y);
+}
+
+/// Launch conversion kernel from complex64 to complex128.
+int convert_state_to_double(
+    const cuComplex* input_state_d,
+    cuDoubleComplex* output_state_d,
+    size_t len,
+    cudaStream_t stream
+) {
+    if (len == 0) {
+        return cudaErrorInvalidValue;
+    }
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const int gridSize = (int)((len + blockSize - 1) / blockSize);
+
+    convert_state_to_complex128_kernel<<<gridSize, blockSize, 0, stream>>>(
+        input_state_d,
+        output_state_d,
+        len
+    );
+
+    return (int)cudaGetLastError();
+}
+
 // TODO: Future encoding methods:
 // - launch_angle_encode (angle encoding)
 // - launch_iqp_encode (IQP encoding)
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index e4d0b9976..2bbd21609 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -160,6 +160,18 @@ unsafe extern "C" {
         stream: *mut c_void,
     ) -> i32;
 
+    /// Convert a complex64 state vector to complex128 on GPU.
+    /// Returns CUDA error code (0 = success).
+    ///
+    /// # Safety
+    /// Pointers must reference valid device memory on the provided stream.
+    pub fn convert_state_to_double(
+        input_state_d: *const CuComplex,
+        output_state_d: *mut CuDoubleComplex,
+        len: usize,
+        stream: *mut c_void,
+    ) -> i32;
+
     /// Launch basis encoding kernel
     /// Maps an integer index to a computational basis state.
     /// Returns CUDA error code (0 = success)
@@ -343,6 +355,17 @@ pub extern "C" fn convert_state_to_float(
     999
 }
 
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn convert_state_to_double(
+    _input_state_d: *const CuComplex,
+    _output_state_d: *mut CuDoubleComplex,
+    _len: usize,
+    _stream: *mut c_void,
+) -> i32 {
+    999
+}
+
 #[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_basis_encode(

(mahout) branch main updated: [QDP] Extend GpuStateVector to support Float32 precision (#995)

Reply via email to