This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 8d5b78935 MAHOUT-802: Add float32 L2 norm reduction kernel for batch processing (#918)
8d5b78935 is described below
commit 8d5b789352337ba1050b3c0e51a48c992fcf6613
Author: Vic Wen <[email protected]>
AuthorDate: Thu Jan 29 16:02:11 2026 +0800
MAHOUT-802: Add float32 L2 norm reduction kernel for batch processing (#918)
* feat: Add float32 support for L2 norm reduction kernels and related functions
* test: add float32 L2 norm reduction tests for single kernels
* feat: Implement float32 L2 norm batch reduction kernel and launch function
* test: Add float32 L2 norm batch reduction test for CUDA kernel
* fix: include qdp_no_cuda flag
* feat: add error handling for num_samples exceeding CUDA_MAX_GRID_DIM_1D
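For context, the new kernels compute one inverse L2 norm per sample,
inv_norms[i] = 1 / sqrt(sum_j x[i*sample_len + j]^2), over a flat row-major
batch. A minimal CPU reference sketch in Rust (the helper name inv_l2_norms
is illustrative only, not part of this patch):

    /// CPU reference: inverse L2 norm per sample (illustrative only).
    fn inv_l2_norms(batch: &[f32], num_samples: usize, sample_len: usize) -> Vec<f32> {
        (0..num_samples)
            .map(|i| {
                // Slice out sample i from the flat batch buffer.
                let s = &batch[i * sample_len..(i + 1) * sample_len];
                let sum_sq: f32 = s.iter().map(|v| v * v).sum();
                1.0 / sum_sq.sqrt()
            })
            .collect()
    }

E.g. the sample [3.0, 4.0] gives 1.0 / 5.0 = 0.2, as asserted in the test below.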
---
qdp/qdp-kernels/src/amplitude.cu | 104 ++++++++++++++++++++++++++++++
qdp/qdp-kernels/src/lib.rs | 25 +++++++
qdp/qdp-kernels/tests/amplitude_encode.rs | 100 +++++++++++++++++++++++++++-
3 files changed, 228 insertions(+), 1 deletion(-)
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 0a31c53b6..aa21059e4 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -455,6 +455,46 @@ __global__ void l2_norm_batch_kernel(
}
}
+/// Kernel: accumulate per-sample sums of squares for a batch (float32).
+/// Grid is organized as (blocks_per_sample * num_samples) blocks.
+__global__ void l2_norm_batch_kernel_f32(
+ const float* __restrict__ input_batch,
+ size_t num_samples,
+ size_t sample_len,
+ size_t blocks_per_sample,
+ float* __restrict__ out_norms
+) {
+ const size_t sample_idx = blockIdx.x / blocks_per_sample;
+ if (sample_idx >= num_samples) return;
+
+ const size_t block_in_sample = blockIdx.x % blocks_per_sample;
+ const size_t base = sample_idx * sample_len;
+
+ const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
+ const size_t stride = blockDim.x * blocks_per_sample;
+
+ float local_sum = 0.0f;
+
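+ // Grid-stride loop over float2 pairs within this sample's slice. Note the
+ // vectorized loads assume the per-sample base is 8-byte aligned (e.g. even sample_len).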
+ size_t vec_offset = vec_idx;
+ size_t offset = vec_offset * 2;
+ while (offset + 1 < sample_len) {
+ const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch + base) + vec_offset);
+ local_sum += v.x * v.x + v.y * v.y;
+ vec_offset += stride;
+ offset = vec_offset * 2;
+ }
+
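+ // Tail: pick up the final element when sample_len is odd.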
+ if (offset < sample_len) {
+ const float v = __ldg(input_batch + base + offset);
+ local_sum += v * v;
+ }
+
+ const float block_sum = block_reduce_sum_f32(local_sum);
+ if (threadIdx.x == 0) {
+ atomicAdd(out_norms + sample_idx, block_sum);
+ }
+}
+
/// Kernel: converts accumulated sum-of-squares into inverse norms.
__global__ void finalize_inv_norm_kernel(
double* __restrict__ norms,
@@ -637,6 +677,70 @@ int launch_l2_norm_batch(
return (int)cudaGetLastError();
}
+/// Launch L2 norm reduction for a batch of vectors (float32).
+/// Writes inverse norms for each sample into `inv_norms_out_d`.
+int launch_l2_norm_batch_f32(
+ const float* input_batch_d,
+ size_t num_samples,
+ size_t sample_len,
+ float* inv_norms_out_d,
+ cudaStream_t stream
+) {
+ if (num_samples == 0 || sample_len == 0) {
+ return cudaErrorInvalidValue;
+ }
+
+ cudaError_t memset_status = cudaMemsetAsync(
+ inv_norms_out_d,
+ 0,
+ num_samples * sizeof(float),
+ stream
+ );
+ if (memset_status != cudaSuccess) {
+ return memset_status;
+ }
+
+ const int blockSize = DEFAULT_BLOCK_SIZE;
+ const size_t elements_per_block = blockSize * 2; // float2 per thread
+ const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
+ if (num_samples > max_grid) {
+ return cudaErrorInvalidValue;
+ }
+
+ size_t blocks_per_sample = (sample_len + elements_per_block - 1) / elements_per_block;
+ const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
+ if (blocks_per_sample == 0) blocks_per_sample = 1;
+ if (blocks_per_sample > max_blocks_per_sample) {
+ blocks_per_sample = max_blocks_per_sample;
+ }
+
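+ // If the combined grid would exceed the 1D limit, shrink blocks_per_sample;
+ // the grid-stride loop in the kernel still covers every element.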
+ size_t gridSize = num_samples * blocks_per_sample;
+ if (gridSize > max_grid) {
+ blocks_per_sample = max_grid / num_samples;
+ if (blocks_per_sample == 0) {
+ blocks_per_sample = 1;
+ }
+ gridSize = num_samples * blocks_per_sample;
+ }
+
+ l2_norm_batch_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+ input_batch_d,
+ num_samples,
+ sample_len,
+ blocks_per_sample,
+ inv_norms_out_d
+ );
+
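+ // Second pass: turn each accumulated sum of squares into an inverse norm (1/sqrt).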
+ const int finalizeBlock = FINALIZE_BLOCK_SIZE;
+ const int finalizeGrid = (num_samples + finalizeBlock - 1) / finalizeBlock;
+ finalize_inv_norm_kernel_f32<<<finalizeGrid, finalizeBlock, 0, stream>>>(
+ inv_norms_out_d,
+ num_samples
+ );
+
+ return (int)cudaGetLastError();
+}
+
/// Kernel: convert complex128 state vector to complex64.
__global__ void convert_state_to_complex64_kernel(
const cuDoubleComplex* __restrict__ input_state,
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index de00a2e6b..e4d0b9976 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -135,6 +135,19 @@ unsafe extern "C" {
stream: *mut c_void,
) -> i32;
+ /// Launch batched L2 norm reduction (returns inverse norms per sample) for float32.
+ /// Returns CUDA error code (0 = success).
+ ///
+ /// # Safety
+ /// Pointers must reference valid device memory on the provided stream.
+ pub fn launch_l2_norm_batch_f32(
+ input_batch_d: *const f32,
+ num_samples: usize,
+ sample_len: usize,
+ inv_norms_out_d: *mut f32,
+ stream: *mut c_void,
+ ) -> i32;
+
/// Convert a complex128 state vector to complex64 on GPU.
/// Returns CUDA error code (0 = success).
///
@@ -307,6 +320,18 @@ pub extern "C" fn launch_l2_norm_f32(
999
}
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_l2_norm_batch_f32(
+ _input_batch_d: *const f32,
+ _num_samples: usize,
+ _sample_len: usize,
+ _inv_norms_out_d: *mut f32,
+ _stream: *mut c_void,
+) -> i32 {
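+ // Sentinel error code: CUDA is unavailable in this build
+ // (non-Linux target or qdp_no_cuda), matching the other stubs in this file.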
+ 999
+}
+
#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
#[unsafe(no_mangle)]
pub extern "C" fn convert_state_to_float(
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
index 5ff9ec1b4..db68dd012 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -26,7 +26,7 @@ use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
use qdp_kernels::{
CuComplex, CuDoubleComplex, launch_amplitude_encode,
launch_amplitude_encode_f32,
- launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_f32,
+ launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
};
const EPSILON: f64 = 1e-10;
@@ -717,6 +717,104 @@ fn test_l2_norm_single_kernel_f32() {
println!("PASS: Single norm reduction (float32) matches CPU");
}
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_f32() {
+ println!("Testing batched L2 norm reduction kernel (float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test batch: [[3.0, 4.0], [1.0, 1.0], [5.0, 12.0]]
+ let batch: Vec<f32> = vec![3.0, 4.0, 1.0, 1.0, 5.0, 12.0];
+ let num_samples = 3;
+ let sample_len = 2;
+
+ let expected: Vec<f32> = vec![
+ 1.0 / (3.0_f32.powi(2) + 4.0_f32.powi(2)).sqrt(), // 0.2
+ 1.0 / (1.0_f32.powi(2) + 1.0_f32.powi(2)).sqrt(), // ~0.707
+ 1.0 / (5.0_f32.powi(2) + 12.0_f32.powi(2)).sqrt(), // ~0.077
+ ];
+
+ let batch_d = device.htod_sync_copy(batch.as_slice()).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch_f32(
+ *batch_d.device_ptr() as *const f32,
+ num_samples,
+ sample_len,
+ *norms_d.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(status, 0, "Batch norm kernel should succeed");
+ device.synchronize().unwrap();
+
+ let norms_h = device.dtoh_sync_copy(&norms_d).unwrap();
+
+ for (i, (got, expect)) in norms_h.iter().zip(expected.iter()).enumerate() {
+ assert!(
+ (got - expect).abs() < EPSILON_F32,
+ "Sample {} inv norm mismatch: expected {}, got {}",
+ i,
+ expect,
+ got
+ );
+ }
+
+ println!("PASS: Batched norm reduction (float32) matches CPU");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_grid_limit_f32() {
+ println!("Testing batched L2 norm reduction with grid limit boundary
(float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test that num_samples exceeding CUDA_MAX_GRID_DIM_1D (65535) returns an error
+ const MAX_GRID_DIM: usize = 65535;
+ let num_samples = MAX_GRID_DIM + 1; // Exceeds limit
+ let sample_len = 2;
+
+ let input: Vec<f32> = vec![1.0; num_samples * sample_len];
+ let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch_f32(
+ *input_d.device_ptr() as *const f32,
+ num_samples,
+ sample_len,
+ *norms_d.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(),
+ )
+ };
+
+ // Should return error because num_samples exceeds grid limit
+ // cudaErrorInvalidValue = 1 (from cuda_error_to_string)
+ assert_eq!(
+ status, 1,
+ "Should reject num_samples exceeding CUDA_MAX_GRID_DIM_1D (f32), got
error code {}",
+ status
+ );
+
+ println!("PASS: Correctly rejected num_samples exceeding grid limit
(f32)");
+}
+
#[test]
#[cfg(not(target_os = "linux"))]
fn test_amplitude_encode_dummy_non_linux() {