This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 546c18264 feat: Implement GPU inverse L2 norm calculation for float32
input in AmplitudeEncoder (#971)
546c18264 is described below
commit 546c18264713ad504db82c22eedc1b97c7569b2e
Author: Vic Wen <[email protected]>
AuthorDate: Sat Jan 31 11:44:22 2026 +0800
feat: Implement GPU inverse L2 norm calculation for float32 input in
AmplitudeEncoder (#971)
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 58 ++++++++++++++++++++
qdp/qdp-core/tests/gpu_norm_f32.rs | 85 +++++++++++++++++++++++++++++
2 files changed, 143 insertions(+)
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 917336a08..f7846a058 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -41,6 +41,7 @@ use cudarc::driver::{DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
use qdp_kernels::{
launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, launch_l2_norm_batch,
+ launch_l2_norm_f32,
};
#[cfg(target_os = "linux")]
use std::ffi::c_void;
@@ -490,4 +491,61 @@ impl AmplitudeEncoder {
Ok(inv_norm)
}
+
+ /// Compute inverse L2 norm on GPU for float32 input using the reduction kernel.
+ ///
+ /// # Arguments
+ /// * `device` - CUDA device reference
+ /// * `input_ptr` - Device pointer to input data (f32 array on GPU)
+ /// * `len` - Number of f32 elements
+ ///
+ /// # Returns
+ /// The inverse L2 norm (1/||x||_2) of the input data as `f32`.
+ ///
+ /// # Safety
+ /// The caller must ensure `input_ptr` points to valid GPU memory containing
+ /// at least `len` f32 elements on the same device as `device`.
+ #[cfg(target_os = "linux")]
+ pub unsafe fn calculate_inv_norm_gpu_f32(
+ device: &Arc<CudaDevice>,
+ input_ptr: *const f32,
+ len: usize,
+ ) -> Result<f32> {
+ crate::profile_scope!("GPU::NormSingleF32");
+
+ let mut norm_buffer = device.alloc_zeros::<f32>(1).map_err(|e| {
+ MahoutError::MemoryAllocation(format!("Failed to allocate f32 norm buffer: {:?}", e))
+ })?;
+
+ let ret = unsafe {
+ launch_l2_norm_f32(
+ input_ptr,
+ len,
+ *norm_buffer.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(), // default stream
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Norm kernel f32 failed: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+
+ let inv_norm_host = device
+ .dtoh_sync_copy(&norm_buffer)
+ .map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;
+
+ let inv_norm = inv_norm_host.first().copied().unwrap_or(0.0);
+ if inv_norm == 0.0 || !inv_norm.is_finite() {
+ return Err(MahoutError::InvalidInput(
+ "Input data (f32) has zero or non-finite norm (contains NaN, Inf, or all zeros)"
+ .to_string(),
+ ));
+ }
+
+ Ok(inv_norm)
+ }
}
diff --git a/qdp/qdp-core/tests/gpu_norm_f32.rs b/qdp/qdp-core/tests/gpu_norm_f32.rs
new file mode 100644
index 000000000..40be53efe
--- /dev/null
+++ b/qdp/qdp-core/tests/gpu_norm_f32.rs
@@ -0,0 +1,85 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Tests for GPU-side f32 L2 norm helper in AmplitudeEncoder.
+//
+
+#![cfg(target_os = "linux")]
+
+use approx::assert_relative_eq;
+use cudarc::driver::{CudaDevice, DevicePtr};
+use qdp_core::gpu::encodings::amplitude::AmplitudeEncoder;
+
+#[test]
+fn test_calculate_inv_norm_gpu_f32_basic() {
+ println!("Testing AmplitudeEncoder::calculate_inv_norm_gpu_f32 (basic case)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Input: [3.0, 4.0] -> norm = 5.0, inv_norm = 0.2
+ let input: Vec<f32> = vec![3.0, 4.0];
+ let expected_norm = (3.0_f32.powi(2) + 4.0_f32.powi(2)).sqrt();
+ let expected_inv_norm = 1.0_f32 / expected_norm;
+
+ let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+ let inv = unsafe {
+ AmplitudeEncoder::calculate_inv_norm_gpu_f32(
+ &device,
+ *input_d.device_ptr() as *const f32,
+ input.len(),
+ )
+ .unwrap()
+ };
+
+ assert_relative_eq!(inv, expected_inv_norm, epsilon = 1e-6_f32);
+}
+
+#[test]
+fn test_calculate_inv_norm_gpu_f32_invalid_zero() {
+ println!("Testing AmplitudeEncoder::calculate_inv_norm_gpu_f32 with zero vector...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input: Vec<f32> = vec![0.0, 0.0, 0.0];
+ let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+
+ let result = unsafe {
+ AmplitudeEncoder::calculate_inv_norm_gpu_f32(
+ &device,
+ *input_d.device_ptr() as *const f32,
+ input.len(),
+ )
+ };
+
+ assert!(
+ result.is_err(),
+ "Expected error for zero-norm f32 input, got {:?}",
+ result
+ );
+}