This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 8d5b78935 MAHOUT-802: Add float32 L2 norm reduction kernel for batch processing (#918)
8d5b78935 is described below
commit 8d5b789352337ba1050b3c0e51a48c992fcf6613
Author: Vic Wen <[email protected]>
AuthorDate: Thu Jan 29 16:02:11 2026 +0800
MAHOUT-802: Add float32 L2 norm reduction kernel for batch processing (#918)
* feat: Add float32 support for L2 norm reduction kernels and related functions
* test: add float32 L2 norm reduction tests for single kernels
* feat: Implement float32 L2 norm batch reduction kernel and launch function
* test: Add float32 L2 norm batch reduction test for CUDA kernel
* fix: include qdp_no_cuda flag
* feat: add error handling for num_samples exceeding CUDA_MAX_GRID_DIM_1D
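For context, the new kernels compute one inverse L2 norm per sample,
inv_norms[i] = 1 / sqrt(sum_j x[i*sample_len + j]^2), over a flat row-major
batch. A minimal CPU reference sketch in Rust (the helper name inv_l2_norms
is illustrative only, not part of this patch):

    /// CPU reference: inverse L2 norm per sample (illustrative only).
    fn inv_l2_norms(batch: &[f32], num_samples: usize, sample_len: usize) -> Vec<f32> {
        (0..num_samples)
            .map(|i| {
                // Slice out sample i from the flat batch buffer.
                let s = &batch[i * sample_len..(i + 1) * sample_len];
                let sum_sq: f32 = s.iter().map(|v| v * v).sum();
                1.0 / sum_sq.sqrt()
            })
            .collect()
    }

E.g. the sample [3.0, 4.0] gives 1.0 / 5.0 = 0.2, as asserted in the test below.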
---
qdp/qdp-kernels/src/amplitude.cu | 104 ++++++++++++++++++++++++++++++
qdp/qdp-kernels/src/lib.rs | 25 +++++++
qdp/qdp-kernels/tests/amplitude_encode.rs | 100 +++++++++++++++++++++++++++-
3 files changed, 228 insertions(+), 1 deletion(-)
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 0a31c53b6..aa21059e4 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -455,6 +455,46 @@ __global__ void l2_norm_batch_kernel(
}
}
+/// Kernel: accumulate per-sample sums of squares for a batch (float32).
+/// Grid is organized as (blocks_per_sample * num_samples) blocks.
+__global__ void l2_norm_batch_kernel_f32(
+ const float* __restrict__ input_batch,
+ size_t num_samples,
+ size_t sample_len,
+ size_t blocks_per_sample,
+ float* __restrict__ out_norms
+) {
+ const size_t sample_idx = blockIdx.x / blocks_per_sample;
+ if (sample_idx >= num_samples) return;
+
+ const size_t block_in_sample = blockIdx.x % blocks_per_sample;
+ const size_t base = sample_idx * sample_len;
+
+ const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
+ const size_t stride = blockDim.x * blocks_per_sample;
+
+ float local_sum = 0.0f;
+
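+ // Grid-stride loop over float2 pairs within this sample's slice. Note the
+ // vectorized loads assume the per-sample base is 8-byte aligned (e.g. even sample_len).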
+ size_t vec_offset = vec_idx;
+ size_t offset = vec_offset * 2;
+ while (offset + 1 < sample_len) {
+ const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch + base) + vec_offset);
+ local_sum += v.x * v.x + v.y * v.y;
+ vec_offset += stride;
+ offset = vec_offset * 2;
+ }
+
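+ // Tail: pick up the final element when sample_len is odd.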
+ if (offset < sample_len) {
+ const float v = __ldg(input_batch + base + offset);
+ local_sum += v * v;
+ }
+
+ const float block_sum = block_reduce_sum_f32(local_sum);
+ if (threadIdx.x == 0) {
+ atomicAdd(out_norms + sample_idx, block_sum);
+ }
+}
+
/// Kernel: converts accumulated sum-of-squares into inverse norms.
__global__ void finalize_inv_norm_kernel(
double* __restrict__ norms,
@@ -637,6 +677,70 @@ int launch_l2_norm_batch(
return (int)cudaGetLastError();
}
+/// Launch L2 norm reduction for a batch of vectors (float32).
+/// Writes inverse norms for each sample into `inv_norms_out_d`.
+int launch_l2_norm_batch_f32(
+ const float* input_batch_d,
+ size_t num_samples,
+ size_t sample_len,
+ float* inv_norms_out_d,
+ cudaStream_t stream
+) {
+ if (num_samples == 0 || sample_len == 0) {
+ return cudaErrorInvalidValue;
+ }
+
+ cudaError_t memset_status = cudaMemsetAsync(
+ inv_norms_out_d,
+ 0,
+ num_samples * sizeof(float),
+ stream
+ );
+ if (memset_status != cudaSuccess) {
+ return memset_status;
+ }
+
+ const int blockSize = DEFAULT_BLOCK_SIZE;
+ const size_t elements_per_block = blockSize * 2; // float2 per thread
+ const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
+ if (num_samples > max_grid) {
+ return cudaErrorInvalidValue;
+ }
+
+ size_t blocks_per_sample = (sample_len + elements_per_block - 1) / elements_per_block;
+ const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
+ if (blocks_per_sample == 0) blocks_per_sample = 1;
+ if (blocks_per_sample > max_blocks_per_sample) {
+ blocks_per_sample = max_blocks_per_sample;
+ }
+
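+ // If the combined grid would exceed the 1D limit, shrink blocks_per_sample;
+ // the grid-stride loop in the kernel still covers every element.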
+ size_t gridSize = num_samples * blocks_per_sample;
+ if (gridSize > max_grid) {
+ blocks_per_sample = max_grid / num_samples;
+ if (blocks_per_sample == 0) {
+ blocks_per_sample = 1;
+ }
+ gridSize = num_samples * blocks_per_sample;
+ }
+
+ l2_norm_batch_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+ input_batch_d,
+ num_samples,
+ sample_len,
+ blocks_per_sample,
+ inv_norms_out_d
+ );
+
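+ // Second pass: turn each accumulated sum of squares into an inverse norm (1/sqrt).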
+ const int finalizeBlock = FINALIZE_BLOCK_SIZE;
+ const int finalizeGrid = (num_samples + finalizeBlock - 1) / finalizeBlock;
+ finalize_inv_norm_kernel_f32<<<finalizeGrid, finalizeBlock, 0, stream>>>(
+ inv_norms_out_d,
+ num_samples
+ );
+
+ return (int)cudaGetLastError();
+}
+
/// Kernel: convert complex128 state vector to complex64.
__global__ void convert_state_to_complex64_kernel(
const cuDoubleComplex* __restrict__ input_state,
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index de00a2e6b..e4d0b9976 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -135,6 +135,19 @@ unsafe extern "C" {
stream: *mut c_void,
) -> i32;
+ /// Launch batched L2 norm reduction (returns inverse norms per sample) for float32.
+ /// Returns CUDA error code (0 = success).
+ ///
+ /// # Safety
+ /// Pointers must reference valid device memory on the provided stream.
+ pub fn launch_l2_norm_batch_f32(
+ input_batch_d: *const f32,
+ num_samples: usize,
+ sample_len: usize,
+ inv_norms_out_d: *mut f32,
+ stream: *mut c_void,
+ ) -> i32;
+
/// Convert a complex128 state vector to complex64 on GPU.
/// Returns CUDA error code (0 = success).
///
@@ -307,6 +320,18 @@ pub extern "C" fn launch_l2_norm_f32(
999
}
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_l2_norm_batch_f32(
+ _input_batch_d: *const f32,
+ _num_samples: usize,
+ _sample_len: usize,
+ _inv_norms_out_d: *mut f32,
+ _stream: *mut c_void,
+) -> i32 {
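+ // Sentinel error code: CUDA is unavailable in this build
+ // (non-Linux target or qdp_no_cuda), matching the other stubs in this file.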
+ 999
+}
+
#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
#[unsafe(no_mangle)]
pub extern "C" fn convert_state_to_float(
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
index 5ff9ec1b4..db68dd012 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -26,7 +26,7 @@ use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
use qdp_kernels::{
CuComplex, CuDoubleComplex, launch_amplitude_encode,
launch_amplitude_encode_f32,
- launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_f32,
+ launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
};
const EPSILON: f64 = 1e-10;
@@ -717,6 +717,104 @@ fn test_l2_norm_single_kernel_f32() {
println!("PASS: Single norm reduction (float32) matches CPU");
}
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_f32() {
+ println!("Testing batched L2 norm reduction kernel (float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test batch: [[3.0, 4.0], [1.0, 1.0], [5.0, 12.0]]
+ let batch: Vec<f32> = vec![3.0, 4.0, 1.0, 1.0, 5.0, 12.0];
+ let num_samples = 3;
+ let sample_len = 2;
+
+ let expected: Vec<f32> = vec![
+ 1.0 / (3.0_f32.powi(2) + 4.0_f32.powi(2)).sqrt(), // 0.2
+ 1.0 / (1.0_f32.powi(2) + 1.0_f32.powi(2)).sqrt(), // ~0.707
+ 1.0 / (5.0_f32.powi(2) + 12.0_f32.powi(2)).sqrt(), // ~0.077
+ ];
+
+ let batch_d = device.htod_sync_copy(batch.as_slice()).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch_f32(
+ *batch_d.device_ptr() as *const f32,
+ num_samples,
+ sample_len,
+ *norms_d.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(status, 0, "Batch norm kernel should succeed");
+ device.synchronize().unwrap();
+
+ let norms_h = device.dtoh_sync_copy(&norms_d).unwrap();
+
+ for (i, (got, expect)) in norms_h.iter().zip(expected.iter()).enumerate() {
+ assert!(
+ (got - expect).abs() < EPSILON_F32,
+ "Sample {} inv norm mismatch: expected {}, got {}",
+ i,
+ expect,
+ got
+ );
+ }
+
+ println!("PASS: Batched norm reduction (float32) matches CPU");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_grid_limit_f32() {
+ println!("Testing batched L2 norm reduction with grid limit boundary
(float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test that num_samples exceeding CUDA_MAX_GRID_DIM_1D (65535) returns an error
+ const MAX_GRID_DIM: usize = 65535;
+ let num_samples = MAX_GRID_DIM + 1; // Exceeds limit
+ let sample_len = 2;
+
+ let input: Vec<f32> = vec![1.0; num_samples * sample_len];
+ let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch_f32(
+ *input_d.device_ptr() as *const f32,
+ num_samples,
+ sample_len,
+ *norms_d.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(),
+ )
+ };
+
+ // Should return error because num_samples exceeds grid limit
+ // cudaErrorInvalidValue = 1 (from cuda_error_to_string)
+ assert_eq!(
+ status, 1,
+ "Should reject num_samples exceeding CUDA_MAX_GRID_DIM_1D (f32), got
error code {}",
+ status
+ );
+
+ println!("PASS: Correctly rejected num_samples exceeding grid limit
(f32)");
+}
+
#[test]
#[cfg(not(target_os = "linux"))]
fn test_amplitude_encode_dummy_non_linux() {