This is an automated email from the ASF dual-hosted git repository.
jiekaichang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 3e401f977 [QDP] Extend CUDA tensor direct GPU encode to angle (#932)
3e401f977 is described below
commit 3e401f9778d41b86f5ce1e644b26f1e5b52f8f76
Author: Jie-Kai Chang <[email protected]>
AuthorDate: Thu Jan 29 19:38:11 2026 +0800
[QDP] Extend CUDA tensor direct GPU encode to angle (#932)
* Extend CUDA tensor direct GPU encode to angle
Signed-off-by: 400Ping <[email protected]>
* fix pre-commit
Signed-off-by: 400Ping <[email protected]>
* update
Signed-off-by: 400Ping <[email protected]>
* fix conflicts
Signed-off-by: 400Ping <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
Signed-off-by: 400Ping <[email protected]>
---
qdp/qdp-core/src/lib.rs | 507 ++++++++++++++++++++++++++++++----------------
qdp/qdp-python/src/lib.rs | 7 +-
2 files changed, 339 insertions(+), 175 deletions(-)
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 2b14f6c5b..f6b58d50a 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -319,7 +319,7 @@ impl QdpEngine {
/// * `input_d` - Device pointer to input data (f64 array on GPU)
/// * `input_len` - Number of f64 elements in the input
/// * `num_qubits` - Number of qubits for encoding
- /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+ /// * `encoding_method` - Strategy (currently "amplitude" and "angle"
supported)
///
/// # Returns
/// DLPack pointer for zero-copy PyTorch integration
@@ -339,13 +339,6 @@ impl QdpEngine {
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtr");
- if encoding_method != "amplitude" {
- return Err(MahoutError::NotImplemented(format!(
- "GPU pointer encoding currently only supports 'amplitude'
method, got '{}'",
- encoding_method
- )));
- }
-
if input_len == 0 {
return Err(MahoutError::InvalidInput(
"Input data cannot be empty".into(),
@@ -353,68 +346,127 @@ impl QdpEngine {
}
let state_len = 1usize << num_qubits;
- if input_len > state_len {
- return Err(MahoutError::InvalidInput(format!(
- "Input size {} exceeds state vector size {} (2^{} qubits)",
- input_len, state_len, num_qubits
- )));
- }
-
- // Allocate output state vector
- let state_vector = {
- crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits)?
- };
-
- // Compute inverse L2 norm on GPU
- let inv_norm = {
- crate::profile_scope!("GPU::NormFromPtr");
- // SAFETY: input_d validity is guaranteed by the caller's safety
contract
- unsafe {
- gpu::AmplitudeEncoder::calculate_inv_norm_gpu(&self.device,
input_d, input_len)?
+ let method = encoding_method.to_ascii_lowercase();
+
+ match method.as_str() {
+ "amplitude" => {
+ if input_len > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input size {} exceeds state vector size {} (2^{}
qubits)",
+ input_len, state_len, num_qubits
+ )));
+ }
+
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ gpu::GpuStateVector::new(&self.device, num_qubits)?
+ };
+
+ let inv_norm = {
+ crate::profile_scope!("GPU::NormFromPtr");
+ // SAFETY: input_d validity is guaranteed by the caller's
safety contract
+ unsafe {
+ gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
+ &self.device,
+ input_d,
+ input_len,
+ )?
+ }
+ };
+
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_amplitude_encode(
+ input_d,
+ state_ptr as *mut std::ffi::c_void,
+ input_len,
+ state_len,
+ inv_norm,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Amplitude encode kernel failed with CUDA error
code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ self.device.synchronize().map_err(|e| {
+ MahoutError::Cuda(format!("CUDA device synchronize
failed: {:?}", e))
+ })?;
+ }
+
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(state_vector.to_dlpack())
}
- };
-
- // Get output pointer
- let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "State vector precision mismatch (expected float64
buffer)".to_string(),
- )
- })?;
-
- // Launch encoding kernel
- {
- crate::profile_scope!("GPU::KernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_amplitude_encode(
- input_d,
- state_ptr as *mut std::ffi::c_void,
- input_len,
- state_len,
- inv_norm,
- std::ptr::null_mut(), // default stream
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Amplitude encode kernel failed with CUDA error code: {}
({})",
- ret,
- cuda_error_to_string(ret)
- )));
+ "angle" => {
+ if input_len != num_qubits {
+ return Err(MahoutError::InvalidInput(format!(
+ "Angle encoding expects {} values (one per qubit), got
{}",
+ num_qubits, input_len
+ )));
+ }
+
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ gpu::GpuStateVector::new(&self.device, num_qubits)?
+ };
+
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_angle_encode(
+ input_d,
+ state_ptr as *mut std::ffi::c_void,
+ state_len,
+ num_qubits as u32,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Angle encoding kernel failed with CUDA error
code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ self.device.synchronize().map_err(|e| {
+ MahoutError::Cuda(format!("CUDA device synchronize
failed: {:?}", e))
+ })?;
+ }
+
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(state_vector.to_dlpack())
}
+ _ => Err(MahoutError::NotImplemented(format!(
+ "GPU pointer encoding currently only supports 'amplitude' and
'angle' methods, got '{}'",
+ encoding_method
+ ))),
}
-
- // Synchronize
- {
- crate::profile_scope!("GPU::Synchronize");
- self.device.synchronize().map_err(|e| {
- MahoutError::Cuda(format!("CUDA device synchronize failed:
{:?}", e))
- })?;
- }
-
- let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
- Ok(state_vector.to_dlpack())
}
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
@@ -428,7 +480,7 @@ impl QdpEngine {
/// * `num_samples` - Number of samples in the batch
/// * `sample_size` - Size of each sample in f64 elements
/// * `num_qubits` - Number of qubits for encoding
- /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+ /// * `encoding_method` - Strategy (currently "amplitude" and "angle"
supported)
///
/// # Returns
/// Single DLPack pointer containing all encoded states (shape:
[num_samples, 2^num_qubits])
@@ -449,13 +501,6 @@ impl QdpEngine {
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
- if encoding_method != "amplitude" {
- return Err(MahoutError::NotImplemented(format!(
- "GPU pointer batch encoding currently only supports
'amplitude' method, got '{}'",
- encoding_method
- )));
- }
-
if num_samples == 0 {
return Err(MahoutError::InvalidInput(
"Number of samples cannot be zero".into(),
@@ -469,106 +514,224 @@ impl QdpEngine {
}
let state_len = 1usize << num_qubits;
- if sample_size > state_len {
- return Err(MahoutError::InvalidInput(format!(
- "Sample size {} exceeds state vector size {} (2^{} qubits)",
- sample_size, state_len, num_qubits
- )));
- }
-
- // Allocate output state vector
- let batch_state_vector = {
- crate::profile_scope!("GPU::AllocBatch");
- gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
- };
-
- // Compute inverse norms on GPU using warp-reduced kernel
- let inv_norms_gpu = {
- crate::profile_scope!("GPU::BatchNormKernel");
- use cudarc::driver::DevicePtrMut;
-
- let mut buffer =
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
- MahoutError::MemoryAllocation(format!("Failed to allocate norm
buffer: {:?}", e))
- })?;
-
- let ret = unsafe {
- qdp_kernels::launch_l2_norm_batch(
- input_batch_d,
- num_samples,
- sample_size,
- *buffer.device_ptr_mut() as *mut f64,
- std::ptr::null_mut(), // default stream
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Norm reduction kernel failed with CUDA error code: {}
({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
-
- buffer
- };
-
- // Validate norms on host to catch zero or NaN samples early
- {
- crate::profile_scope!("GPU::NormValidation");
- let host_inv_norms = self
- .device
- .dtoh_sync_copy(&inv_norms_gpu)
- .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms
to host: {:?}", e)))?;
-
- if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
- return Err(MahoutError::InvalidInput(
- "One or more samples have zero or invalid
norm".to_string(),
- ));
+ let method = encoding_method.to_ascii_lowercase();
+
+ match method.as_str() {
+ "amplitude" => {
+ if sample_size > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Sample size {} exceeds state vector size {} (2^{}
qubits)",
+ sample_size, state_len, num_qubits
+ )));
+ }
+
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
+ };
+
+ let inv_norms_gpu = {
+ crate::profile_scope!("GPU::BatchNormKernel");
+ use cudarc::driver::DevicePtrMut;
+
+ let mut buffer =
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+ MahoutError::MemoryAllocation(format!(
+ "Failed to allocate norm buffer: {:?}",
+ e
+ ))
+ })?;
+
+ let ret = unsafe {
+ qdp_kernels::launch_l2_norm_batch(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ *buffer.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Norm reduction kernel failed with CUDA error
code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+
+ buffer
+ };
+
+ {
+ crate::profile_scope!("GPU::NormValidation");
+ let host_inv_norms =
+ self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e|
{
+ MahoutError::Cuda(format!("Failed to copy norms to
host: {:?}", e))
+ })?;
+
+ if host_inv_norms.iter().any(|v| !v.is_finite() || *v ==
0.0) {
+ return Err(MahoutError::InvalidInput(
+ "One or more samples have zero or invalid
norm".to_string(),
+ ));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ use cudarc::driver::DevicePtr;
+
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(||
{
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected
float64 buffer)"
+ .to_string(),
+ )
+ })?;
+
+ let ret = unsafe {
+ qdp_kernels::launch_amplitude_encode_batch(
+ input_batch_d,
+ state_ptr as *mut std::ffi::c_void,
+ *inv_norms_gpu.device_ptr() as *const f64,
+ num_samples,
+ sample_size,
+ state_len,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch kernel launch failed with CUDA error code:
{} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ self.device
+ .synchronize()
+ .map_err(|e| MahoutError::Cuda(format!("Sync failed:
{:?}", e)))?;
+ }
+
+ let batch_state_vector =
+ batch_state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(batch_state_vector.to_dlpack())
}
- }
-
- // Launch batch kernel
- {
- crate::profile_scope!("GPU::BatchKernelLaunch");
- use cudarc::driver::DevicePtr;
-
- let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "Batch state vector precision mismatch (expected float64
buffer)".to_string(),
- )
- })?;
-
- let ret = unsafe {
- qdp_kernels::launch_amplitude_encode_batch(
- input_batch_d,
- state_ptr as *mut std::ffi::c_void,
- *inv_norms_gpu.device_ptr() as *const f64,
- num_samples,
- sample_size,
- state_len,
- std::ptr::null_mut(), // default stream
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Batch kernel launch failed with CUDA error code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
+ "angle" => {
+ use cudarc::driver::DevicePtrMut;
+
+ if sample_size != num_qubits {
+ return Err(MahoutError::InvalidInput(format!(
+ "Angle encoding expects sample_size={} (one angle per
qubit), got {}",
+ num_qubits, sample_size
+ )));
+ }
+
+ // Validate that all input angles are finite (no NaN/Inf),
consistent with
+ // CPU and host-side batch angle encoding paths.
+ let angle_validation_buffer = {
+ crate::profile_scope!("GPU::AngleFiniteCheckBatch");
+
+ let mut buffer =
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+ MahoutError::MemoryAllocation(format!(
+ "Failed to allocate angle validation buffer: {:?}",
+ e
+ ))
+ })?;
+
+ let ret = unsafe {
+ qdp_kernels::launch_l2_norm_batch(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ *buffer.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Angle validation norm kernel failed with CUDA
error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+
+ buffer
+ };
+
+ {
+
crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
+ let host_norms = self
+ .device
+ .dtoh_sync_copy(&angle_validation_buffer)
+ .map_err(|e| {
+ MahoutError::Cuda(format!(
+ "Failed to copy angle validation norms to
host: {:?}",
+ e
+ ))
+ })?;
+
+ if host_norms.iter().any(|v| !v.is_finite()) {
+ return Err(MahoutError::InvalidInput(
+ "Angle encoding batch contains non-finite values
(NaN or Inf)"
+ .to_string(),
+ ));
+ }
+ }
+
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
+ };
+
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected
float64 buffer)"
+ .to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_angle_encode_batch(
+ input_batch_d,
+ state_ptr as *mut std::ffi::c_void,
+ num_samples,
+ state_len,
+ num_qubits as u32,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch angle encoding kernel failed: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ self.device
+ .synchronize()
+ .map_err(|e| MahoutError::Cuda(format!("Sync failed:
{:?}", e)))?;
+ }
+
+ let batch_state_vector =
+ batch_state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(batch_state_vector.to_dlpack())
}
+ _ => Err(MahoutError::NotImplemented(format!(
+ "GPU pointer batch encoding currently only supports
'amplitude' and 'angle' methods, got '{}'",
+ encoding_method
+ ))),
}
-
- // Synchronize
- {
- crate::profile_scope!("GPU::Synchronize");
- self.device
- .synchronize()
- .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}",
e)))?;
- }
-
- let batch_state_vector = batch_state_vector.to_precision(&self.device,
self.precision)?;
- Ok(batch_state_vector.to_dlpack())
}
}
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index 0df6cedc6..8647d98b7 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -218,10 +218,11 @@ fn validate_cuda_tensor_for_encoding(
expected_device_id: usize,
encoding_method: &str,
) -> PyResult<()> {
- // Check encoding method support (currently only amplitude is supported
for CUDA tensors)
- if encoding_method != "amplitude" {
+ let method = encoding_method.to_ascii_lowercase();
+ // Check encoding method support (currently amplitude and angle are
supported for CUDA tensors)
+ if method != "amplitude" && method != "angle" {
return Err(PyRuntimeError::new_err(format!(
- "CUDA tensor encoding currently only supports 'amplitude' method,
got '{}'. \
+ "CUDA tensor encoding currently only supports 'amplitude' and
'angle' methods, got '{}'. \
Use tensor.cpu() to convert to CPU tensor for other encoding
methods.",
encoding_method
)));