This is an automated email from the ASF dual-hosted git repository. guanmingchiu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/mahout.git
commit c79f06a3b0270ae47a4d9b68fd111e79831f2d6e Author: Ping <[email protected]> AuthorDate: Thu Dec 25 19:51:10 2025 +0800 [QDP] Add launch_amplitude_encode_f32 function (#739) * float32 follow up Signed-off-by: 400Ping <[email protected]> * update Signed-off-by: 400Ping <[email protected]> --------- Signed-off-by: 400Ping <[email protected]> --- qdp/qdp-kernels/src/amplitude.cu | 67 ++++++++++++++++++++++++++++--- qdp/qdp-kernels/src/lib.rs | 27 +++++++++++++ qdp/qdp-kernels/tests/amplitude_encode.rs | 59 ++++++++++++++++++++++++++- 3 files changed, 147 insertions(+), 6 deletions(-) diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu index ea5fc27f7..7cf94ce92 100644 --- a/qdp/qdp-kernels/src/amplitude.cu +++ b/qdp/qdp-kernels/src/amplitude.cu @@ -41,18 +41,17 @@ __global__ void amplitude_encode_kernel( // Vectorized Load Optimization: // If we are well within bounds, treat input as double2 to issue a single 128-bit load instruction. - // This reduces memory transactions and improves throughput on RTX cards. + // Use __ldg() to pull through the read-only cache; cudaMalloc aligns to 256 bytes so the + // reinterpret_cast<double2*> load is naturally aligned. if (state_idx_base + 1 < input_len) { // Reinterpret cast to load two doubles at once - // Note: Assumes input is reasonably aligned (standard cudaMalloc provides 256-byte alignment) - const double2* input_vec = reinterpret_cast<const double2*>(input); - double2 loaded = input_vec[idx]; + const double2 loaded = __ldg(reinterpret_cast<const double2*>(input) + idx); v1 = loaded.x; v2 = loaded.y; } // Handle edge case: Odd input length else if (state_idx_base < input_len) { - v1 = input[state_idx_base]; + v1 = __ldg(input + state_idx_base); // v2 remains 0.0 } @@ -66,6 +65,35 @@ __global__ void amplitude_encode_kernel( } } +__global__ void amplitude_encode_kernel_f32( + const float* __restrict__ input, + cuComplex* __restrict__ state, + size_t input_len, + size_t state_len, + float inv_norm +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t state_idx_base = idx * 2; + if (state_idx_base >= state_len) return; + + float v1 = 0.0f; + float v2 = 0.0f; + + if (state_idx_base + 1 < input_len) { + // Mirror the double kernel: cached vectorized load for two floats + const float2 loaded = __ldg(reinterpret_cast<const float2*>(input) + idx); + v1 = loaded.x; + v2 = loaded.y; + } else if (state_idx_base < input_len) { + v1 = __ldg(input + state_idx_base); + } + + state[state_idx_base] = make_cuComplex(v1 * inv_norm, 0.0f); + if (state_idx_base + 1 < state_len) { + state[state_idx_base + 1] = make_cuComplex(v2 * inv_norm, 0.0f); + } +} + // Warp-level reduction for sum using shuffle instructions __device__ __forceinline__ double warp_reduce_sum(double val) { for (int offset = warpSize / 2; offset > 0; offset >>= 1) { @@ -137,6 +165,35 @@ int launch_amplitude_encode( return (int)cudaGetLastError(); } +/// Launch amplitude encoding kernel for float32 +int launch_amplitude_encode_f32( + const float* input_d, + void* state_d, + size_t input_len, + size_t state_len, + float inv_norm, + cudaStream_t stream +) { + if (inv_norm <= 0.0f || !isfinite(inv_norm)) { + return cudaErrorInvalidValue; + } + + cuComplex* state_complex_d = static_cast<cuComplex*>(state_d); + + const int blockSize = 256; + const int gridSize = (state_len / 2 + blockSize - 1) / blockSize; + + amplitude_encode_kernel_f32<<<gridSize, blockSize, 0, stream>>>( + input_d, + state_complex_d, + input_len, + state_len, + inv_norm + ); + + return (int)cudaGetLastError(); +} + /// Optimized batch amplitude encoding kernel /// /// Memory Layout (row-major): diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs index bae8782ef..d9fc0a163 100644 --- a/qdp/qdp-kernels/src/lib.rs +++ b/qdp/qdp-kernels/src/lib.rs @@ -69,6 +69,20 @@ unsafe extern "C" { stream: *mut c_void, ) -> i32; + /// Launch amplitude encoding kernel (float32 input/output) + /// Returns CUDA error code (0 = success) + /// + /// # Safety + /// Requires valid GPU pointers, must sync before freeing + pub fn launch_amplitude_encode_f32( + input_d: *const f32, + state_d: *mut c_void, + input_len: usize, + state_len: usize, + inv_norm: f32, + stream: *mut c_void, + ) -> i32; + /// Launch batch amplitude encoding kernel /// Returns CUDA error code (0 = success) /// @@ -138,6 +152,19 @@ pub extern "C" fn launch_amplitude_encode( 999 // Error: CUDA unavailable } +#[cfg(not(target_os = "linux"))] +#[unsafe(no_mangle)] +pub extern "C" fn launch_amplitude_encode_f32( + _input_d: *const f32, + _state_d: *mut c_void, + _input_len: usize, + _state_len: usize, + _inv_norm: f32, + _stream: *mut c_void, +) -> i32 { + 999 +} + #[cfg(not(target_os = "linux"))] #[unsafe(no_mangle)] pub extern "C" fn launch_l2_norm( diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs index e290d550c..4223dd0bb 100644 --- a/qdp/qdp-kernels/tests/amplitude_encode.rs +++ b/qdp/qdp-kernels/tests/amplitude_encode.rs @@ -19,9 +19,17 @@ #[cfg(target_os = "linux")] use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut}; #[cfg(target_os = "linux")] -use qdp_kernels::{CuDoubleComplex, launch_amplitude_encode, launch_l2_norm, launch_l2_norm_batch}; +use qdp_kernels::{ + CuComplex, + CuDoubleComplex, + launch_amplitude_encode, + launch_amplitude_encode_f32, + launch_l2_norm, + launch_l2_norm_batch, +}; const EPSILON: f64 = 1e-10; +const EPSILON_F32: f32 = 1e-5; #[test] #[cfg(target_os = "linux")] @@ -94,6 +102,55 @@ fn test_amplitude_encode_basic() { println!("PASS: Basic amplitude encoding works correctly"); } +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_basic_f32() { + println!("Testing basic amplitude encoding (float32)..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input: Vec<f32> = vec![3.0, 4.0]; + let norm = (input[0] * input[0] + input[1] * input[1]).sqrt(); + let inv_norm = 1.0f32 / norm; + let state_len = 4usize; + + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode_f32( + *input_d.device_ptr() as *const f32, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + inv_norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + assert!((state_h[0].x - 0.6).abs() < EPSILON_F32, "First element should be 0.6"); + assert!(state_h[0].y.abs() < EPSILON_F32, "First element imaginary should be 0"); + assert!((state_h[1].x - 0.8).abs() < EPSILON_F32, "Second element should be 0.8"); + assert!(state_h[1].y.abs() < EPSILON_F32, "Second element imaginary should be 0"); + assert!(state_h[2].x.abs() < EPSILON_F32, "Third element should be 0"); + assert!(state_h[3].x.abs() < EPSILON_F32, "Fourth element should be 0"); + + let total_prob: f32 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum(); + assert!((total_prob - 1.0).abs() < EPSILON_F32, "Total probability should be 1.0"); + + println!("PASS: Basic float32 amplitude encoding works correctly"); +} + #[test] #[cfg(target_os = "linux")] fn test_amplitude_encode_power_of_two() {
