viiccwen commented on code in PR #918:
URL: https://github.com/apache/mahout/pull/918#discussion_r2737758593


##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -512,6 +677,66 @@ int launch_l2_norm_batch(
     return (int)cudaGetLastError();
 }
 
+/// Launch L2 norm reduction for a batch of vectors (float32).
+/// Writes inverse norms for each sample into `inv_norms_out_d`.
+int launch_l2_norm_batch_f32(
+    const float* input_batch_d,
+    size_t num_samples,
+    size_t sample_len,
+    float* inv_norms_out_d,
+    cudaStream_t stream
+) {
+    if (num_samples == 0 || sample_len == 0) {
+        return cudaErrorInvalidValue;
+    }
+
+    cudaError_t memset_status = cudaMemsetAsync(
+        inv_norms_out_d,
+        0,
+        num_samples * sizeof(float),
+        stream
+    );
+    if (memset_status != cudaSuccess) {
+        return memset_status;
+    }
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const size_t elements_per_block = blockSize * 2; // float2 per thread
+    size_t blocks_per_sample = (sample_len + elements_per_block - 1) / elements_per_block;
+    const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
+    if (blocks_per_sample == 0) blocks_per_sample = 1;
+    if (blocks_per_sample > max_blocks_per_sample) {
+        blocks_per_sample = max_blocks_per_sample;
+    }
+
+    size_t gridSize = num_samples * blocks_per_sample;
+    const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
+    if (gridSize > max_grid) {
+        blocks_per_sample = max_grid / num_samples;
+        if (blocks_per_sample == 0) {
+            blocks_per_sample = 1;
+        }
+        gridSize = num_samples * blocks_per_sample;

Review Comment:
   I think the first suggestion is better.
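
   For reference, a minimal host-side sketch of how this launcher might be driven end to end. The buffer names, batch shape, and all-ones test data below are illustrative assumptions, not taken from the PR:

   ```cuda
   // Illustrative host-side sketch only -- names and sizes are assumptions.
   #include <cuda_runtime.h>
   #include <cstdio>
   #include <vector>

   // Declaration matching the launcher added in this hunk.
   int launch_l2_norm_batch_f32(const float* input_batch_d,
                                size_t num_samples,
                                size_t sample_len,
                                float* inv_norms_out_d,
                                cudaStream_t stream);

   int main() {
       const size_t num_samples = 4;     // assumed batch size
       const size_t sample_len  = 1024;  // assumed per-sample vector length

       std::vector<float> host_batch(num_samples * sample_len, 1.0f);

       float* input_d = nullptr;
       float* inv_norms_d = nullptr;
       cudaMalloc(&input_d, host_batch.size() * sizeof(float));
       cudaMalloc(&inv_norms_d, num_samples * sizeof(float));
       cudaMemcpy(input_d, host_batch.data(),
                  host_batch.size() * sizeof(float), cudaMemcpyHostToDevice);

       cudaStream_t stream;
       cudaStreamCreate(&stream);

       // The launcher writes one inverse L2 norm per sample into inv_norms_d.
       int status = launch_l2_norm_batch_f32(input_d, num_samples, sample_len,
                                             inv_norms_d, stream);
       cudaStreamSynchronize(stream);

       std::vector<float> inv_norms(num_samples);
       cudaMemcpy(inv_norms.data(), inv_norms_d,
                  num_samples * sizeof(float), cudaMemcpyDeviceToHost);
       // For all-ones vectors of length 1024, each inverse norm should be 1/32.
       printf("status=%d inv_norm[0]=%f\n", status, inv_norms[0]);

       cudaStreamDestroy(stream);
       cudaFree(input_d);
       cudaFree(inv_norms_d);
       return 0;
   }
   ```

   Since the launcher zero-initializes `inv_norms_out_d` with `cudaMemsetAsync` on the same stream before the reduction, the caller only needs to synchronize the stream before reading the results back.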


