Copilot commented on code in PR #690:
URL: https://github.com/apache/mahout/pull/690#discussion_r2593047127


##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -121,6 +122,129 @@ impl QuantumEncoder for AmplitudeEncoder {
     fn description(&self) -> &'static str {
         "Amplitude encoding with L2 normalization"
     }
+
+    /// Override to avoid intermediate Vec allocation. Processes chunks 
directly to GPU offsets.

Review Comment:
   The comment states "Override" but `encode_chunked` is not part of the 
`QuantumEncoder` trait. This method is being added to the trait implementation 
but doesn't override anything. Consider updating the comment to accurately 
reflect that this is a new method, e.g., "Encodes chunked data to avoid 
intermediate Vec allocation. Processes chunks directly to GPU offsets."
   ```suggestion
       /// Encodes chunked data to avoid intermediate Vec allocation. Processes 
chunks directly to GPU offsets.
   ```



##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -121,6 +122,129 @@ impl QuantumEncoder for AmplitudeEncoder {
     fn description(&self) -> &'static str {
         "Amplitude encoding with L2 normalization"
     }
+
+    /// Override to avoid intermediate Vec allocation. Processes chunks 
directly to GPU offsets.
+    fn encode_chunked(
+        &self,
+        device: &Arc<CudaDevice>,
+        chunks: &[Float64Array],
+        num_qubits: usize,
+    ) -> Result<GpuStateVector> {
+        #[cfg(target_os = "linux")]
+        {
+            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
+            let state_len = 1 << num_qubits;
+
+            if total_len == 0 {
+                return Err(MahoutError::InvalidInput("Input chunks cannot be 
empty".to_string()));
+            }
+            if total_len > state_len {
+                return Err(MahoutError::InvalidInput(
+                    format!("Total data length {} exceeds state vector size 
{}", total_len, state_len)
+                ));
+            }
+            if num_qubits == 0 || num_qubits > 30 {
+                return Err(MahoutError::InvalidInput(
+                    format!("Number of qubits {} must be between 1 and 30", 
num_qubits)
+                ));
+            }
+
+            let state_vector = {
+                crate::profile_scope!("GPU::Alloc");
+                GpuStateVector::new(device, num_qubits)?
+            };
+
+            let norm = {
+                crate::profile_scope!("CPU::L2Norm");
+                let mut norm_sq = 0.0;
+                for chunk in chunks {
+                    if chunk.null_count() == 0 {
+                        norm_sq += chunk.values().iter().map(|&x| x * 
x).sum::<f64>();
+                    } else {
+                        norm_sq += chunk.iter().map(|opt| 
opt.unwrap_or(0.0).powi(2)).sum::<f64>();
+                    }
+                }
+                let norm = norm_sq.sqrt();
+                if norm == 0.0 {
+                    return Err(MahoutError::InvalidInput("Input data has zero 
norm".to_string()));
+                }
+                norm
+            };
+
+            let mut current_offset = 0;
+            for chunk in chunks {
+                let chunk_len = chunk.len();
+                if chunk_len == 0 {
+                    continue;
+                }
+
+                let state_ptr_offset = unsafe {
+                    state_vector.ptr().cast::<u8>()
+                        .add(current_offset * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>())
+                        .cast::<std::ffi::c_void>()

Review Comment:
   Inconsistent usage of the `c_void` type. Line 184 uses the fully qualified path 
`std::ffi::c_void`, while line 222 uses the imported shorthand `c_void`. For 
consistency with the existing codebase pattern (see line 80, 274, 313 which use 
the imported `c_void`), use `c_void` instead of `std::ffi::c_void`.
   ```suggestion
                           .cast::<c_void>()
   ```



##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -121,6 +122,129 @@ impl QuantumEncoder for AmplitudeEncoder {
     fn description(&self) -> &'static str {
         "Amplitude encoding with L2 normalization"
     }
+
+    /// Override to avoid intermediate Vec allocation. Processes chunks 
directly to GPU offsets.
+    fn encode_chunked(
+        &self,
+        device: &Arc<CudaDevice>,
+        chunks: &[Float64Array],
+        num_qubits: usize,
+    ) -> Result<GpuStateVector> {
+        #[cfg(target_os = "linux")]
+        {
+            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
+            let state_len = 1 << num_qubits;
+
+            if total_len == 0 {
+                return Err(MahoutError::InvalidInput("Input chunks cannot be 
empty".to_string()));
+            }
+            if total_len > state_len {
+                return Err(MahoutError::InvalidInput(
+                    format!("Total data length {} exceeds state vector size 
{}", total_len, state_len)
+                ));
+            }
+            if num_qubits == 0 || num_qubits > 30 {
+                return Err(MahoutError::InvalidInput(
+                    format!("Number of qubits {} must be between 1 and 30", 
num_qubits)
+                ));
+            }
+
+            let state_vector = {
+                crate::profile_scope!("GPU::Alloc");
+                GpuStateVector::new(device, num_qubits)?
+            };
+
+            let norm = {
+                crate::profile_scope!("CPU::L2Norm");
+                let mut norm_sq = 0.0;
+                for chunk in chunks {
+                    if chunk.null_count() == 0 {
+                        norm_sq += chunk.values().iter().map(|&x| x * 
x).sum::<f64>();
+                    } else {
+                        norm_sq += chunk.iter().map(|opt| 
opt.unwrap_or(0.0).powi(2)).sum::<f64>();
+                    }
+                }
+                let norm = norm_sq.sqrt();
+                if norm == 0.0 {
+                    return Err(MahoutError::InvalidInput("Input data has zero 
norm".to_string()));
+                }
+                norm
+            };
+
+            let mut current_offset = 0;
+            for chunk in chunks {
+                let chunk_len = chunk.len();
+                if chunk_len == 0 {
+                    continue;
+                }
+
+                let state_ptr_offset = unsafe {
+                    state_vector.ptr().cast::<u8>()
+                        .add(current_offset * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>())
+                        .cast::<std::ffi::c_void>()
+                };
+
+                let chunk_slice = {
+                    crate::profile_scope!("GPU::ChunkH2DCopy");
+                    if chunk.null_count() == 0 {
+                        device.htod_sync_copy(chunk.values())
+                    } else {
+                        let temp_vec: Vec<f64> = chunk.iter().map(|opt| 
opt.unwrap_or(0.0)).collect();
+                        device.htod_sync_copy(&temp_vec)
+                    }
+                    .map_err(|e| MahoutError::MemoryAllocation(format!("Failed 
to copy chunk: {:?}", e)))?
+                };
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        launch_amplitude_encode(
+                            *chunk_slice.device_ptr() as *const f64,
+                            state_ptr_offset,
+                            chunk_len,
+                            state_len,
+                            norm,
+                            std::ptr::null_mut(),
+                        )
+                    };
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(
+                            format!("Kernel launch failed: {} ({})", ret, 
cuda_error_to_string(ret))
+                        ));
+                    }
+                }
+
+                current_offset += chunk_len;
+            }
+
+            if total_len < state_len {
+                let padding_bytes = (state_len - total_len) * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>();
+                let tail_ptr = unsafe { state_vector.ptr().add(total_len) as 
*mut c_void };
+
+                unsafe {
+                    unsafe extern "C" {
+                        fn cudaMemsetAsync(devPtr: *mut c_void, value: i32, 
count: usize, stream: *mut c_void) -> i32;
+                    }
+                    let result = cudaMemsetAsync(tail_ptr, 0, padding_bytes, 
std::ptr::null_mut());
+                    if result != 0 {
+                        return Err(MahoutError::Cuda(
+                            format!("Failed to zero-fill padding: {} ({})", 
result, cuda_error_to_string(result))
+                        ));
+                    }
+                }
+            }
+
+            device.synchronize()
+                .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", 
e)))?;
+
+            Ok(state_vector)
+        }
+
+        #[cfg(not(target_os = "linux"))]
+        {
+            Err(MahoutError::Cuda("CUDA unavailable (non-Linux)".to_string()))
+        }
+    }

Review Comment:
   The `encode_chunked` method is placed inside the `impl QuantumEncoder for 
AmplitudeEncoder` block, but it's not part of the `QuantumEncoder` trait. For 
better code organization and clarity, consider moving this method to the 
inherent `impl AmplitudeEncoder` block (starting at line 250) where other 
AmplitudeEncoder-specific methods like `encode_async_pipeline` are defined.



##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -121,6 +122,129 @@ impl QuantumEncoder for AmplitudeEncoder {
     fn description(&self) -> &'static str {
         "Amplitude encoding with L2 normalization"
     }
+
+    /// Override to avoid intermediate Vec allocation. Processes chunks 
directly to GPU offsets.
+    fn encode_chunked(
+        &self,
+        device: &Arc<CudaDevice>,
+        chunks: &[Float64Array],
+        num_qubits: usize,
+    ) -> Result<GpuStateVector> {
+        #[cfg(target_os = "linux")]
+        {
+            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
+            let state_len = 1 << num_qubits;
+
+            if total_len == 0 {
+                return Err(MahoutError::InvalidInput("Input chunks cannot be 
empty".to_string()));
+            }
+            if total_len > state_len {
+                return Err(MahoutError::InvalidInput(
+                    format!("Total data length {} exceeds state vector size 
{}", total_len, state_len)
+                ));
+            }
+            if num_qubits == 0 || num_qubits > 30 {
+                return Err(MahoutError::InvalidInput(
+                    format!("Number of qubits {} must be between 1 and 30", 
num_qubits)
+                ));
+            }
+
+            let state_vector = {
+                crate::profile_scope!("GPU::Alloc");
+                GpuStateVector::new(device, num_qubits)?
+            };
+
+            let norm = {
+                crate::profile_scope!("CPU::L2Norm");
+                let mut norm_sq = 0.0;
+                for chunk in chunks {
+                    if chunk.null_count() == 0 {
+                        norm_sq += chunk.values().iter().map(|&x| x * 
x).sum::<f64>();
+                    } else {
+                        norm_sq += chunk.iter().map(|opt| 
opt.unwrap_or(0.0).powi(2)).sum::<f64>();
+                    }
+                }
+                let norm = norm_sq.sqrt();
+                if norm == 0.0 {
+                    return Err(MahoutError::InvalidInput("Input data has zero 
norm".to_string()));
+                }
+                norm
+            };
+
+            let mut current_offset = 0;
+            for chunk in chunks {
+                let chunk_len = chunk.len();
+                if chunk_len == 0 {
+                    continue;
+                }
+
+                let state_ptr_offset = unsafe {
+                    state_vector.ptr().cast::<u8>()
+                        .add(current_offset * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>())
+                        .cast::<std::ffi::c_void>()
+                };
+
+                let chunk_slice = {
+                    crate::profile_scope!("GPU::ChunkH2DCopy");
+                    if chunk.null_count() == 0 {
+                        device.htod_sync_copy(chunk.values())
+                    } else {
+                        let temp_vec: Vec<f64> = chunk.iter().map(|opt| 
opt.unwrap_or(0.0)).collect();
+                        device.htod_sync_copy(&temp_vec)
+                    }
+                    .map_err(|e| MahoutError::MemoryAllocation(format!("Failed 
to copy chunk: {:?}", e)))?
+                };
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        launch_amplitude_encode(
+                            *chunk_slice.device_ptr() as *const f64,
+                            state_ptr_offset,
+                            chunk_len,
+                            state_len,
+                            norm,
+                            std::ptr::null_mut(),
+                        )
+                    };
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(
+                            format!("Kernel launch failed: {} ({})", ret, 
cuda_error_to_string(ret))
+                        ));
+                    }
+                }
+
+                current_offset += chunk_len;
+            }
+
+            if total_len < state_len {
+                let padding_bytes = (state_len - total_len) * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>();
+                let tail_ptr = unsafe { state_vector.ptr().add(total_len) as 
*mut c_void };
+
+                unsafe {
+                    unsafe extern "C" {
+                        fn cudaMemsetAsync(devPtr: *mut c_void, value: i32, 
count: usize, stream: *mut c_void) -> i32;
+                    }
+                    let result = cudaMemsetAsync(tail_ptr, 0, padding_bytes, 
std::ptr::null_mut());
+                    if result != 0 {
+                        return Err(MahoutError::Cuda(
+                            format!("Failed to zero-fill padding: {} ({})", 
result, cuda_error_to_string(result))
+                        ));
+                    }
+                }
+            }
+
+            device.synchronize()
+                .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", 
e)))?;
+
+            Ok(state_vector)
+        }
+
+        #[cfg(not(target_os = "linux"))]
+        {
+            Err(MahoutError::Cuda("CUDA unavailable (non-Linux)".to_string()))
+        }
+    }

Review Comment:
   The new `encode_chunked` method lacks test coverage. Given that the 
repository has comprehensive tests for other encoding methods (see 
`qdp/qdp-core/tests/api_workflow.rs`, `validation.rs`, and `memory_safety.rs`), 
this new method should also be tested. Consider adding tests that verify:
   1. Successful chunked encoding with multiple chunks
   2. Handling of chunks with null values
   3. Edge cases like empty chunks or single-chunk scenarios
   4. Validation errors (empty chunks, data exceeding state vector size, 
invalid qubit counts)
   5. Proper GPU memory allocation and padding behavior



##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -121,6 +122,129 @@ impl QuantumEncoder for AmplitudeEncoder {
     fn description(&self) -> &'static str {
         "Amplitude encoding with L2 normalization"
     }
+
+    /// Override to avoid intermediate Vec allocation. Processes chunks 
directly to GPU offsets.
+    fn encode_chunked(
+        &self,
+        device: &Arc<CudaDevice>,
+        chunks: &[Float64Array],
+        num_qubits: usize,
+    ) -> Result<GpuStateVector> {
+        #[cfg(target_os = "linux")]
+        {
+            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
+            let state_len = 1 << num_qubits;
+
+            if total_len == 0 {
+                return Err(MahoutError::InvalidInput("Input chunks cannot be 
empty".to_string()));
+            }
+            if total_len > state_len {
+                return Err(MahoutError::InvalidInput(
+                    format!("Total data length {} exceeds state vector size 
{}", total_len, state_len)
+                ));
+            }
+            if num_qubits == 0 || num_qubits > 30 {
+                return Err(MahoutError::InvalidInput(
+                    format!("Number of qubits {} must be between 1 and 30", 
num_qubits)
+                ));
+            }
+
+            let state_vector = {
+                crate::profile_scope!("GPU::Alloc");
+                GpuStateVector::new(device, num_qubits)?
+            };
+
+            let norm = {
+                crate::profile_scope!("CPU::L2Norm");
+                let mut norm_sq = 0.0;
+                for chunk in chunks {
+                    if chunk.null_count() == 0 {
+                        norm_sq += chunk.values().iter().map(|&x| x * 
x).sum::<f64>();
+                    } else {
+                        norm_sq += chunk.iter().map(|opt| 
opt.unwrap_or(0.0).powi(2)).sum::<f64>();
+                    }
+                }
+                let norm = norm_sq.sqrt();
+                if norm == 0.0 {
+                    return Err(MahoutError::InvalidInput("Input data has zero 
norm".to_string()));
+                }
+                norm
+            };
+
+            let mut current_offset = 0;
+            for chunk in chunks {
+                let chunk_len = chunk.len();
+                if chunk_len == 0 {
+                    continue;
+                }
+
+                let state_ptr_offset = unsafe {
+                    state_vector.ptr().cast::<u8>()
+                        .add(current_offset * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>())
+                        .cast::<std::ffi::c_void>()
+                };
+
+                let chunk_slice = {
+                    crate::profile_scope!("GPU::ChunkH2DCopy");
+                    if chunk.null_count() == 0 {
+                        device.htod_sync_copy(chunk.values())
+                    } else {
+                        let temp_vec: Vec<f64> = chunk.iter().map(|opt| 
opt.unwrap_or(0.0)).collect();
+                        device.htod_sync_copy(&temp_vec)
+                    }
+                    .map_err(|e| MahoutError::MemoryAllocation(format!("Failed 
to copy chunk: {:?}", e)))?
+                };
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        launch_amplitude_encode(
+                            *chunk_slice.device_ptr() as *const f64,
+                            state_ptr_offset,
+                            chunk_len,
+                            state_len,
+                            norm,
+                            std::ptr::null_mut(),
+                        )
+                    };
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(
+                            format!("Kernel launch failed: {} ({})", ret, 
cuda_error_to_string(ret))
+                        ));
+                    }
+                }
+
+                current_offset += chunk_len;
+            }
+
+            if total_len < state_len {
+                let padding_bytes = (state_len - total_len) * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>();
+                let tail_ptr = unsafe { state_vector.ptr().add(total_len) as 
*mut c_void };
+
+                unsafe {
+                    unsafe extern "C" {

Review Comment:
   Possibly redundant `unsafe` keyword. The `extern "C"` block on line 225 is already 
inside an `unsafe` block (line 224), so the inner `unsafe` may look redundant. Note, 
however, that `unsafe extern "C" { ... }` is distinct syntax that marks the foreign 
block itself (required on Rust edition 2024 and unrelated to the surrounding `unsafe` 
block), so the keyword may be intentional. Apply the simplification below only if the 
crate targets edition 2021 or earlier: `extern "C" { ... }`.
   ```suggestion
                       extern "C" {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to