This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 314260c1e [QDP] Extend GpuStateVector to support Float32 precision
(#995)
314260c1e is described below
commit 314260c1ebbbfb2ddbd8d8cda5cfd4ca596a7036
Author: Vic Wen <[email protected]>
AuthorDate: Sat Jan 31 23:47:36 2026 +0800
[QDP] Extend GpuStateVector to support Float32 precision (#995)
* feat: extend GPU state vector allocation with precision support (f32)
* feat: extend GPU state vector precision conversion to support both
Float64 and Float32
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 2 +-
qdp/qdp-core/src/gpu/encodings/angle.rs | 2 +-
qdp/qdp-core/src/gpu/encodings/basis.rs | 2 +-
qdp/qdp-core/src/gpu/encodings/iqp.rs | 2 +-
qdp/qdp-core/src/gpu/memory.rs | 198 ++++++++++++++++++++++------
qdp/qdp-core/src/lib.rs | 4 +-
qdp/qdp-core/tests/dlpack.rs | 38 +++++-
qdp/qdp-kernels/src/amplitude.cu | 36 +++++
qdp/qdp-kernels/src/lib.rs | 23 ++++
9 files changed, 257 insertions(+), 50 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index f7846a058..62313550d 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -70,7 +70,7 @@ impl QuantumEncoder for AmplitudeEncoder {
// Allocate GPU state vector
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(_device, num_qubits)?
+ GpuStateVector::new(_device, num_qubits,
crate::gpu::memory::Precision::Float64)?
};
// Async Pipeline for large data
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 353a9f4c4..2a91cb017 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -63,7 +63,7 @@ impl QuantumEncoder for AngleEncoder {
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(device, num_qubits)?
+ GpuStateVector::new(device, num_qubits,
crate::gpu::memory::Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 0b24f9796..e883372f5 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -76,7 +76,7 @@ impl QuantumEncoder for BasisEncoder {
// Allocate GPU state vector
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(device, num_qubits)?
+ GpuStateVector::new(device, num_qubits,
crate::gpu::memory::Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs
b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 89d4a8f56..245229a40 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -87,7 +87,7 @@ impl QuantumEncoder for IqpEncoder {
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(device, num_qubits)?
+ GpuStateVector::new(device, num_qubits,
crate::gpu::memory::Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index 5ee59291a..2bb922c16 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -196,6 +196,13 @@ impl BufferStorage {
_ => None,
}
}
+
+ fn ptr_f32(&self) -> Option<*mut CuComplex> {
+ match self {
+ BufferStorage::F32(buf) => Some(buf.ptr()),
+ _ => None,
+ }
+ }
}
/// Quantum state vector on GPU
@@ -220,59 +227,86 @@ unsafe impl Send for GpuStateVector {}
unsafe impl Sync for GpuStateVector {}
impl GpuStateVector {
- /// Create GPU state vector for n qubits
- /// Allocates 2^n complex numbers on GPU (freed on drop)
- pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> {
+ /// Create GPU state vector for n qubits with the given precision.
+ /// Allocates 2^n complex numbers (Float32 = CuComplex, Float64 =
CuDoubleComplex).
+ /// Default for most callers: use `Precision::Float64`.
+ #[cfg(target_os = "linux")]
+ pub fn new(_device: &Arc<CudaDevice>, qubits: usize, precision: Precision)
-> Result<Self> {
let _size_elements: usize = 1usize << qubits;
- #[cfg(target_os = "linux")]
- {
- let requested_bytes = _size_elements
- .checked_mul(std::mem::size_of::<CuDoubleComplex>())
- .ok_or_else(|| {
- MahoutError::MemoryAllocation(format!(
- "Requested GPU allocation size overflow (elements={})",
- _size_elements
- ))
- })?;
+ let buffer = match precision {
+ Precision::Float32 => {
+ let requested_bytes = _size_elements
+ .checked_mul(std::mem::size_of::<CuComplex>())
+ .ok_or_else(|| {
+ MahoutError::MemoryAllocation(format!(
+ "Requested GPU allocation size overflow
(elements={})",
+ _size_elements
+ ))
+ })?;
- // Pre-flight check to gracefully fail before cudaMalloc when OOM
is obvious
- ensure_device_memory_available(
- requested_bytes,
- "state vector allocation",
- Some(qubits),
- )?;
+ ensure_device_memory_available(
+ requested_bytes,
+ "state vector allocation (f32)",
+ Some(qubits),
+ )?;
- // Use uninitialized allocation to avoid memory bandwidth waste.
- // TODO: Consider using a memory pool for input buffers to avoid
repeated
- // cudaMalloc overhead in high-frequency encode() calls.
- let slice =
- unsafe { _device.alloc::<CuDoubleComplex>(_size_elements)
}.map_err(|e| {
+ let slice = unsafe {
_device.alloc::<CuComplex>(_size_elements) }.map_err(|e| {
map_allocation_error(
requested_bytes,
- "state vector allocation",
+ "state vector allocation (f32)",
Some(qubits),
e,
)
})?;
- Ok(Self {
- buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
- num_qubits: qubits,
- size_elements: _size_elements,
- num_samples: None,
- device_id: _device.ordinal(),
- })
- }
+ BufferStorage::F32(GpuBufferRaw { slice })
+ }
+ Precision::Float64 => {
+ let requested_bytes = _size_elements
+ .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+ .ok_or_else(|| {
+ MahoutError::MemoryAllocation(format!(
+ "Requested GPU allocation size overflow
(elements={})",
+ _size_elements
+ ))
+ })?;
- #[cfg(not(target_os = "linux"))]
- {
- // Non-Linux: compiles but GPU unavailable
- Err(MahoutError::Cuda(
- "CUDA is only available on Linux. This build does not support
GPU operations."
- .to_string(),
- ))
- }
+ ensure_device_memory_available(
+ requested_bytes,
+ "state vector allocation",
+ Some(qubits),
+ )?;
+
+ let slice =
+ unsafe { _device.alloc::<CuDoubleComplex>(_size_elements)
}.map_err(|e| {
+ map_allocation_error(
+ requested_bytes,
+ "state vector allocation",
+ Some(qubits),
+ e,
+ )
+ })?;
+
+ BufferStorage::F64(GpuBufferRaw { slice })
+ }
+ };
+
+ Ok(Self {
+ buffer: Arc::new(buffer),
+ num_qubits: qubits,
+ size_elements: _size_elements,
+ num_samples: None,
+ device_id: _device.ordinal(),
+ })
+ }
+
+ #[cfg(not(target_os = "linux"))]
+ pub fn new(_device: &Arc<CudaDevice>, _qubits: usize, _precision:
Precision) -> Result<Self> {
+ Err(MahoutError::Cuda(
+ "CUDA is only available on Linux. This build does not support GPU
operations."
+ .to_string(),
+ ))
}
/// Get current precision of the underlying buffer.
@@ -293,6 +327,11 @@ impl GpuStateVector {
self.buffer.ptr_f64()
}
+ /// Returns a single-precision pointer if the buffer stores complex64 data.
+ pub fn ptr_f32(&self) -> Option<*mut CuComplex> {
+ self.buffer.ptr_f32()
+ }
+
/// Get the number of qubits
pub fn num_qubits(&self) -> usize {
self.num_qubits
@@ -362,13 +401,88 @@ impl GpuStateVector {
/// Convert the state vector to the requested precision (GPU-side).
///
- /// For now only down-conversion from Float64 -> Float32 is supported.
+ /// Supports Float64 -> Float32 and Float32 -> Float64.
pub fn to_precision(&self, device: &Arc<CudaDevice>, target: Precision) ->
Result<Self> {
if self.precision() == target {
return Ok(self.clone());
}
match (self.precision(), target) {
+ (Precision::Float32, Precision::Float64) => {
+ #[cfg(target_os = "linux")]
+ {
+ let requested_bytes = self
+ .size_elements
+ .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+ .ok_or_else(|| {
+ MahoutError::MemoryAllocation(format!(
+ "Requested GPU allocation size overflow
(elements={})",
+ self.size_elements
+ ))
+ })?;
+
+ ensure_device_memory_available(
+ requested_bytes,
+ "state vector precision conversion",
+ Some(self.num_qubits),
+ )?;
+
+ let slice = unsafe {
device.alloc::<CuDoubleComplex>(self.size_elements) }
+ .map_err(|e| {
+ map_allocation_error(
+ requested_bytes,
+ "state vector precision conversion",
+ Some(self.num_qubits),
+ e,
+ )
+ })?;
+
+ let src_ptr = self.ptr_f32().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Source state vector is not Float32; cannot
convert to Float64"
+ .to_string(),
+ )
+ })?;
+
+ let ret = unsafe {
+ qdp_kernels::convert_state_to_double(
+ src_ptr as *const CuComplex,
+ *slice.device_ptr() as *mut CuDoubleComplex,
+ self.size_elements,
+ std::ptr::null_mut(),
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Precision conversion kernel failed: {}",
+ ret
+ )));
+ }
+
+ device.synchronize().map_err(|e| {
+ MahoutError::Cuda(format!(
+ "Failed to sync after precision conversion: {:?}",
+ e
+ ))
+ })?;
+
+ Ok(Self {
+ buffer: Arc::new(BufferStorage::F64(GpuBufferRaw {
slice })),
+ num_qubits: self.num_qubits,
+ size_elements: self.size_elements,
+ num_samples: self.num_samples,
+ device_id: device.ordinal(),
+ })
+ }
+
+ #[cfg(not(target_os = "linux"))]
+ {
+ Err(MahoutError::Cuda(
+ "Precision conversion requires CUDA
(Linux)".to_string(),
+ ))
+ }
+ }
(Precision::Float64, Precision::Float32) => {
#[cfg(target_os = "linux")]
{
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 3de648fc9..a8028c62e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -447,7 +447,7 @@ impl QdpEngine {
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits)?
+ gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float64)?
};
let inv_norm = {
@@ -508,7 +508,7 @@ impl QdpEngine {
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits)?
+ gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/tests/dlpack.rs b/qdp/qdp-core/tests/dlpack.rs
index 6b97283ce..3c039b371 100644
--- a/qdp/qdp-core/tests/dlpack.rs
+++ b/qdp/qdp-core/tests/dlpack.rs
@@ -21,6 +21,7 @@ mod dlpack_tests {
use std::ffi::c_void;
use cudarc::driver::CudaDevice;
+ use qdp_core::Precision;
use qdp_core::dlpack::{CUDA_STREAM_LEGACY, synchronize_stream};
use qdp_core::gpu::memory::GpuStateVector;
@@ -59,8 +60,8 @@ mod dlpack_tests {
let device = CudaDevice::new(0).unwrap();
let num_qubits = 2;
- let state_vector =
- GpuStateVector::new(&device, num_qubits).expect("Failed to create
state vector");
+ let state_vector = GpuStateVector::new(&device, num_qubits,
Precision::Float64)
+ .expect("Failed to create state vector");
let dlpack_ptr = state_vector.to_dlpack();
assert!(!dlpack_ptr.is_null());
@@ -86,6 +87,39 @@ mod dlpack_tests {
}
}
+ #[test]
+ #[cfg(target_os = "linux")]
+ fn test_dlpack_single_shape_f32() {
+ let device = CudaDevice::new(0).unwrap();
+
+ let num_qubits = 2;
+ let state_vector = GpuStateVector::new(&device, num_qubits,
Precision::Float32)
+ .expect("Failed to create Float32 state vector");
+
+ assert!(
+ state_vector.ptr_f32().is_some(),
+ "Float32 state vector should have ptr_f32()"
+ );
+ assert!(
+ state_vector.ptr_f64().is_none(),
+ "Float32 state vector should not have ptr_f64()"
+ );
+
+ let dlpack_ptr = state_vector.to_dlpack();
+ assert!(!dlpack_ptr.is_null());
+
+ unsafe {
+ let tensor = &(*dlpack_ptr).dl_tensor;
+ assert_eq!(tensor.ndim, 2, "DLPack tensor should be 2D");
+ let shape = std::slice::from_raw_parts(tensor.shape, 2);
+ assert_eq!(shape[0], 1);
+ assert_eq!(shape[1], (1 << num_qubits) as i64);
+ if let Some(deleter) = (*dlpack_ptr).deleter {
+ deleter(dlpack_ptr);
+ }
+ }
+ }
+
/// synchronize_stream(null) is a no-op and returns Ok(()) on all
platforms.
#[test]
fn test_synchronize_stream_null() {
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index aa21059e4..06676e081 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -777,6 +777,42 @@ int convert_state_to_float(
return (int)cudaGetLastError();
}
+/// Kernel: convert complex64 state vector to complex128.
+__global__ void convert_state_to_complex128_kernel(
+ const cuComplex* __restrict__ input_state,
+ cuDoubleComplex* __restrict__ output_state,
+ size_t len
+) {
+ const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx >= len) return;
+
+ const cuComplex v = input_state[idx];
+ output_state[idx] = make_cuDoubleComplex((double)v.x, (double)v.y);
+}
+
+/// Launch conversion kernel from complex64 to complex128.
+int convert_state_to_double(
+ const cuComplex* input_state_d,
+ cuDoubleComplex* output_state_d,
+ size_t len,
+ cudaStream_t stream
+) {
+ if (len == 0) {
+ return cudaErrorInvalidValue;
+ }
+
+ const int blockSize = DEFAULT_BLOCK_SIZE;
+ const int gridSize = (int)((len + blockSize - 1) / blockSize);
+
+ convert_state_to_complex128_kernel<<<gridSize, blockSize, 0, stream>>>(
+ input_state_d,
+ output_state_d,
+ len
+ );
+
+ return (int)cudaGetLastError();
+}
+
// TODO: Future encoding methods:
// - launch_angle_encode (angle encoding)
// - launch_iqp_encode (IQP encoding)
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index e4d0b9976..2bbd21609 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -160,6 +160,18 @@ unsafe extern "C" {
stream: *mut c_void,
) -> i32;
+ /// Convert a complex64 state vector to complex128 on GPU.
+ /// Returns CUDA error code (0 = success).
+ ///
+ /// # Safety
+ /// Pointers must reference valid device memory on the provided stream.
+ pub fn convert_state_to_double(
+ input_state_d: *const CuComplex,
+ output_state_d: *mut CuDoubleComplex,
+ len: usize,
+ stream: *mut c_void,
+ ) -> i32;
+
/// Launch basis encoding kernel
/// Maps an integer index to a computational basis state.
/// Returns CUDA error code (0 = success)
@@ -343,6 +355,17 @@ pub extern "C" fn convert_state_to_float(
999
}
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn convert_state_to_double(
+ _input_state_d: *const CuComplex,
+ _output_state_d: *mut CuDoubleComplex,
+ _len: usize,
+ _stream: *mut c_void,
+) -> i32 {
+ 999
+}
+
#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
#[unsafe(no_mangle)]
pub extern "C" fn launch_basis_encode(