(mahout) branch main updated: feat: Implement GPU inverse L2 norm calculation for float32 input in AmplitudeEncoder (#971)

guanmingchiu Fri, 30 Jan 2026 19:44:41 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new 546c18264 feat: Implement GPU inverse L2 norm calculation for float32 input in AmplitudeEncoder (#971)
546c18264 is described below

commit 546c18264713ad504db82c22eedc1b97c7569b2e
Author: Vic Wen <[email protected]>
AuthorDate: Sat Jan 31 11:44:22 2026 +0800

    feat: Implement GPU inverse L2 norm calculation for float32 input in AmplitudeEncoder (#971)
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs | 58 ++++++++++++++++++++
 qdp/qdp-core/tests/gpu_norm_f32.rs          | 85 +++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 917336a08..f7846a058 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -41,6 +41,7 @@ use cudarc::driver::{DevicePtr, DevicePtrMut};
 #[cfg(target_os = "linux")]
 use qdp_kernels::{
     launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, launch_l2_norm_batch,
+    launch_l2_norm_f32,
 };
 #[cfg(target_os = "linux")]
 use std::ffi::c_void;
@@ -490,4 +491,61 @@ impl AmplitudeEncoder {
 
         Ok(inv_norm)
     }
+
+    /// Compute inverse L2 norm on GPU for float32 input using the reduction kernel.
+    ///
+    /// # Arguments
+    /// * `device` - CUDA device reference
+    /// * `input_ptr` - Device pointer to input data (f32 array on GPU)
+    /// * `len` - Number of f32 elements
+    ///
+    /// # Returns
+    /// The inverse L2 norm (1/||x||_2) of the input data as `f32`.
+    ///
+    /// # Safety
+    /// The caller must ensure `input_ptr` points to valid GPU memory containing
+    /// at least `len` f32 elements on the same device as `device`.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn calculate_inv_norm_gpu_f32(
+        device: &Arc<CudaDevice>,
+        input_ptr: *const f32,
+        len: usize,
+    ) -> Result<f32> {
+        crate::profile_scope!("GPU::NormSingleF32");
+
+        let mut norm_buffer = device.alloc_zeros::<f32>(1).map_err(|e| {
+            MahoutError::MemoryAllocation(format!("Failed to allocate f32 norm buffer: {:?}", e))
+        })?;
+
+        let ret = unsafe {
+            launch_l2_norm_f32(
+                input_ptr,
+                len,
+                *norm_buffer.device_ptr_mut() as *mut f32,
+                std::ptr::null_mut(), // default stream
+            )
+        };
+
+        if ret != 0 {
+            return Err(MahoutError::KernelLaunch(format!(
+                "Norm kernel f32 failed: {} ({})",
+                ret,
+                cuda_error_to_string(ret)
+            )));
+        }
+
+        let inv_norm_host = device
+            .dtoh_sync_copy(&norm_buffer)
+            .map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;
+
+        let inv_norm = inv_norm_host.first().copied().unwrap_or(0.0);
+        if inv_norm == 0.0 || !inv_norm.is_finite() {
+            return Err(MahoutError::InvalidInput(
+                "Input data (f32) has zero or non-finite norm (contains NaN, Inf, or all zeros)"
+                    .to_string(),
+            ));
+        }
+
+        Ok(inv_norm)
+    }
 }
diff --git a/qdp/qdp-core/tests/gpu_norm_f32.rs b/qdp/qdp-core/tests/gpu_norm_f32.rs
new file mode 100644
index 000000000..40be53efe
--- /dev/null
+++ b/qdp/qdp-core/tests/gpu_norm_f32.rs
@@ -0,0 +1,85 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Tests for GPU-side f32 L2 norm helper in AmplitudeEncoder.
+//
+
+#![cfg(target_os = "linux")]
+
+use approx::assert_relative_eq;
+use cudarc::driver::{CudaDevice, DevicePtr};
+use qdp_core::gpu::encodings::amplitude::AmplitudeEncoder;
+
+#[test]
+fn test_calculate_inv_norm_gpu_f32_basic() {
+    println!("Testing AmplitudeEncoder::calculate_inv_norm_gpu_f32 (basic case)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Input: [3.0, 4.0] -> norm = 5.0, inv_norm = 0.2
+    let input: Vec<f32> = vec![3.0, 4.0];
+    let expected_norm = (3.0_f32.powi(2) + 4.0_f32.powi(2)).sqrt();
+    let expected_inv_norm = 1.0_f32 / expected_norm;
+
+    let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+    let inv = unsafe {
+        AmplitudeEncoder::calculate_inv_norm_gpu_f32(
+            &device,
+            *input_d.device_ptr() as *const f32,
+            input.len(),
+        )
+        .unwrap()
+    };
+
+    assert_relative_eq!(inv, expected_inv_norm, epsilon = 1e-6_f32);
+}
+
+#[test]
+fn test_calculate_inv_norm_gpu_f32_invalid_zero() {
+    println!("Testing AmplitudeEncoder::calculate_inv_norm_gpu_f32 with zero vector...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input: Vec<f32> = vec![0.0, 0.0, 0.0];
+    let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+
+    let result = unsafe {
+        AmplitudeEncoder::calculate_inv_norm_gpu_f32(
+            &device,
+            *input_d.device_ptr() as *const f32,
+            input.len(),
+        )
+    };
+
+    assert!(
+        result.is_err(),
+        "Expected error for zero-norm f32 input, got {:?}",
+        result
+    );
+}

(mahout) branch main updated: feat: Implement GPU inverse L2 norm calculation for float32 input in AmplitudeEncoder (#971)

Reply via email to