This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new c7e38cb94 feat: support CUDA tensor GPU-pointer encoding for iqp and iqp-z (#1115)
c7e38cb94 is described below
commit c7e38cb94121972ec4d3d830cb2816f46e1f0c74
Author: Vic Wen <[email protected]>
AuthorDate: Mon Mar 9 01:11:52 2026 +0800
feat: support CUDA tensor GPU-pointer encoding for iqp and iqp-z (#1115)
* feat: support GPU-pointer encoding
* test: add testing for qdp-core
* fix: handle misaligned float32 batch amplitude loads
* fix: expected data length calculation and consolidate qubit input validation
* test: add more testing, not only happy path
---------
Co-authored-by: Ryan Huang <[email protected]>
---
qdp/qdp-core/src/gpu/encodings/iqp.rs | 158 ++++++++++++-
qdp/qdp-core/tests/gpu_ptr_encoding.rs | 419 +++++++++++++++++++++++++++++++++
2 files changed, 569 insertions(+), 8 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index bcdc15018..78a1274f8 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -57,7 +57,8 @@ impl IqpEncoder {
fn expected_data_len(&self, num_qubits: usize) -> usize {
if self.enable_zz {
// n single-qubit + n*(n-1)/2 two-qubit terms
- num_qubits + num_qubits * (num_qubits - 1) / 2
+ let n_minus_one = num_qubits.saturating_sub(1);
+ num_qubits + num_qubits * n_minus_one / 2
} else {
num_qubits
}
@@ -148,6 +149,13 @@ impl QuantumEncoder for IqpEncoder {
) -> Result<GpuStateVector> {
crate::profile_scope!("IqpEncoder::encode_batch");
+ if num_qubits == 0 || num_qubits > 30 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Number of qubits {} must be between 1 and 30",
+ num_qubits
+ )));
+ }
+
let expected_len = self.expected_data_len(num_qubits);
if sample_size != expected_len {
return Err(MahoutError::InvalidInput(format!(
@@ -168,13 +176,6 @@ impl QuantumEncoder for IqpEncoder {
)));
}
- if num_qubits == 0 || num_qubits > 30 {
- return Err(MahoutError::InvalidInput(format!(
- "Number of qubits {} must be between 1 and 30",
- num_qubits
- )));
- }
-
for (i, &val) in batch_data.iter().enumerate() {
if !val.is_finite() {
let sample_idx = i / sample_size;
@@ -241,6 +242,147 @@ impl QuantumEncoder for IqpEncoder {
Ok(batch_state_vector)
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if num_qubits == 0 || num_qubits > 30 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Number of qubits {} must be between 1 and 30",
+ num_qubits
+ )));
+ }
+
+ let expected_len = self.expected_data_len(num_qubits);
+ if input_len != expected_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "IQP{} encoding expects {} values for {} qubits, got {}",
+ if self.enable_zz { "" } else { "-Z" },
+ expected_len,
+ num_qubits,
+ input_len
+ )));
+ }
+
+ let state_len = 1 << num_qubits;
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
+ };
+
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64 buffer)".to_string(),
+ )
+ })?;
+
+ let ret = {
+ crate::profile_scope!("GPU::KernelLaunch");
+ unsafe {
+ qdp_kernels::launch_iqp_encode(
+ input_d as *const f64,
+ state_ptr as *mut c_void,
+ state_len,
+ num_qubits as u32,
+ if self.enable_zz { 1 } else { 0 },
+ stream,
+ )
+ }
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "IQP encoding kernel failed with CUDA error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+
+ Ok(state_vector)
+ }
+
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_batch_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_batch_d: *const c_void,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if num_qubits == 0 || num_qubits > 30 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Number of qubits {} must be between 1 and 30",
+ num_qubits
+ )));
+ }
+
+ let expected_len = self.expected_data_len(num_qubits);
+ if sample_size != expected_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "IQP{} encoding expects sample_size={} for {} qubits, got {}",
+ if self.enable_zz { "" } else { "-Z" },
+ expected_len,
+ num_qubits,
+ sample_size
+ )));
+ }
+
+ let state_len = 1 << num_qubits;
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
+ };
+
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_iqp_encode_batch(
+ input_batch_d as *const f64,
+ state_ptr as *mut c_void,
+ num_samples,
+ state_len,
+ num_qubits as u32,
+ sample_size as u32,
+ if self.enable_zz { 1 } else { 0 },
+ stream,
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch IQP encoding kernel failed: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+
+ Ok(batch_state_vector)
+ }
+
fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
if num_qubits == 0 {
return Err(MahoutError::InvalidInput(
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 97d648fdc..ef7ffb9d5 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -26,6 +26,16 @@ use qdp_core::{MahoutError, Precision, QdpEngine};
mod common;
+/// IQP full encoding expected data length: n + n*(n-1)/2.
+fn iqp_full_data_len(num_qubits: usize) -> usize {
+ num_qubits + num_qubits * (num_qubits.saturating_sub(1)) / 2
+}
+
+/// IQP-Z encoding expected data length: n.
+fn iqp_z_data_len(num_qubits: usize) -> usize {
+ num_qubits
+}
+
// ---- Helpers for f32 encode_from_gpu_ptr_f32 tests ----
fn engine_f32() -> Option<QdpEngine> {
@@ -541,6 +551,415 @@ fn test_encode_batch_from_gpu_ptr_basis_success() {
}
}
+#[test]
+fn test_encode_batch_from_gpu_ptr_iqp_success() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let state_len = 1 << num_qubits;
+ let sample_size = iqp_full_data_len(num_qubits);
+ let num_samples = 3;
+ let total = num_samples * sample_size;
+ let data: Vec<f64> = (0..total).map(|i| (i as f64) * 0.05).collect();
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr(ptr, num_samples, sample_size, num_qubits, "iqp")
+ .expect("encode_batch_from_gpu_ptr iqp should succeed")
+ };
+ assert!(!dlpack_ptr.is_null());
+ assert_dlpack_batch_shape_and_delete(dlpack_ptr, num_samples as i64, state_len as i64);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_iqp_z_success() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let state_len = 1 << num_qubits;
+ let sample_size = iqp_z_data_len(num_qubits);
+ let num_samples = 3;
+ let total = num_samples * sample_size;
+ let data: Vec<f64> = (0..total).map(|i| (i as f64) * 0.05).collect();
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr(ptr, num_samples, sample_size, num_qubits, "iqp-z")
+ .expect("encode_batch_from_gpu_ptr iqp-z should succeed")
+ };
+ assert!(!dlpack_ptr.is_null());
+ assert_dlpack_batch_shape_and_delete(dlpack_ptr, num_samples as i64, state_len as i64);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_iqp_wrong_sample_size() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let expected_sample_size = iqp_full_data_len(num_qubits);
+ let wrong_sample_size = expected_sample_size + 1;
+ let num_samples = 2;
+ let data = vec![0.1_f64; num_samples * wrong_sample_size];
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let result = unsafe {
+ engine.encode_batch_from_gpu_ptr(ptr, num_samples, wrong_sample_size, num_qubits, "iqp")
+ };
+ assert!(result.is_err());
+ match &result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(
+ msg.contains("expects") || msg.contains("sample_size"),
+ "msg: {}",
+ msg
+ );
+ }
+ _ => panic!("expected InvalidInput"),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_iqp_z_wrong_sample_size() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let expected_sample_size = iqp_z_data_len(num_qubits);
+ let wrong_sample_size = expected_sample_size + 1;
+ let num_samples = 2;
+ let data = vec![0.1_f64; num_samples * wrong_sample_size];
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let result = unsafe {
+ engine.encode_batch_from_gpu_ptr(ptr, num_samples, wrong_sample_size, num_qubits, "iqp-z")
+ };
+ assert!(result.is_err());
+ match &result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(
+ msg.contains("expects") || msg.contains("sample_size"),
+ "msg: {}",
+ msg
+ );
+ }
+ _ => panic!("expected InvalidInput"),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_iqp_z_success() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 2;
+ let data = [0.1_f64, -0.2_f64];
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr(ptr, data.len(), num_qubits, "iqp-z")
+ .expect("encode_from_gpu_ptr iqp-z should succeed")
+ };
+
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_iqp_success() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 2;
+ let data = [0.1_f64, -0.2_f64, 0.3_f64];
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr(ptr, data.len(), num_qubits, "iqp")
+ .expect("encode_from_gpu_ptr iqp should succeed")
+ };
+
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_iqp_wrong_input_len() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let expected_len = iqp_full_data_len(num_qubits);
+ let data = vec![0.1_f64; expected_len];
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let result_too_few =
+ unsafe { engine.encode_from_gpu_ptr(ptr, expected_len - 1, num_qubits, "iqp") };
+ assert!(result_too_few.is_err());
+ match &result_too_few {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("expects") || msg.contains("sample"))
+ }
+ _ => panic!("expected InvalidInput"),
+ }
+
+ let result_too_many =
+ unsafe { engine.encode_from_gpu_ptr(ptr, expected_len + 1, num_qubits, "iqp") };
+ assert!(result_too_many.is_err());
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_iqp_z_wrong_input_len() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let expected_len = iqp_z_data_len(num_qubits);
+ let data = vec![0.1_f64; expected_len];
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let result = unsafe { engine.encode_from_gpu_ptr(ptr, expected_len + 1, num_qubits, "iqp-z") };
+ assert!(result.is_err());
+ match &result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("expects") || msg.contains("sample"))
+ }
+ _ => panic!("expected InvalidInput"),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_with_stream_iqp_success() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let data = [0.1_f64, -0.2_f64, 0.3_f64];
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr_with_stream(
+ ptr,
+ data.len(),
+ num_qubits,
+ "iqp",
+ std::ptr::null_mut(),
+ )
+ .expect("encode_from_gpu_ptr_with_stream iqp")
+ };
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_with_stream_iqp_z_success() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 2;
+ let data = [0.1_f64, -0.2_f64];
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr_with_stream(
+ ptr,
+ data.len(),
+ num_qubits,
+ "iqp-z",
+ std::ptr::null_mut(),
+ )
+ .expect("encode_from_gpu_ptr_with_stream iqp-z")
+ };
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_iqp_three_qubits() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 3;
+ let state_len = 1 << num_qubits;
+ let expected_len = iqp_full_data_len(num_qubits);
+ let data: Vec<f64> = (0..expected_len).map(|i| (i as f64) * 0.1).collect();
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr(ptr, data.len(), num_qubits, "iqp")
+ .expect("encode_from_gpu_ptr iqp 3 qubits")
+ };
+ assert!(!dlpack_ptr.is_null());
+ unsafe {
+ let tensor = &(*dlpack_ptr).dl_tensor;
+ assert_eq!(tensor.ndim, 2);
+ let shape = std::slice::from_raw_parts(tensor.shape, 2);
+ assert_eq!(shape[0], 1);
+ assert_eq!(shape[1], state_len as i64);
+ if let Some(deleter) = (*dlpack_ptr).deleter {
+ deleter(dlpack_ptr);
+ }
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_iqp_z_three_qubits() {
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+ let num_qubits = 3;
+ let state_len = 1 << num_qubits;
+ let expected_len = iqp_z_data_len(num_qubits);
+ let data: Vec<f64> = (0..expected_len).map(|i| (i as f64) * 0.1).collect();
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr(ptr, data.len(), num_qubits, "iqp-z")
+ .expect("encode_from_gpu_ptr iqp-z 3 qubits")
+ };
+ assert!(!dlpack_ptr.is_null());
+ unsafe {
+ let tensor = &(*dlpack_ptr).dl_tensor;
+ assert_eq!(tensor.ndim, 2);
+ let shape = std::slice::from_raw_parts(tensor.shape, 2);
+ assert_eq!(shape[0], 1);
+ assert_eq!(shape[1], state_len as i64);
+ if let Some(deleter) = (*dlpack_ptr).deleter {
+ deleter(dlpack_ptr);
+ }
+ }
+}
+
// ---- encode_from_gpu_ptr_f32 (float32 amplitude) ----
#[test]