[ 
https://issues.apache.org/jira/browse/MAHOUT-802?focusedWorklogId=1001690&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-1001690
 ]

ASF GitHub Bot logged work on MAHOUT-802:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 24/Jan/26 15:04
            Start Date: 24/Jan/26 15:04
    Worklog Time Spent: 10m 
      Work Description: rich7420 commented on code in PR #918:
URL: https://github.com/apache/mahout/pull/918#discussion_r2724245973


##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -391,6 +455,46 @@ __global__ void l2_norm_batch_kernel(
     }
 }
 
+/// Kernel: accumulate L2 norms for a batch (float32).
+/// Grid is organized as (blocks_per_sample * num_samples) blocks.
+__global__ void l2_norm_batch_kernel_f32(
+    const float* __restrict__ input_batch,
+    size_t num_samples,
+    size_t sample_len,
+    size_t blocks_per_sample,
+    float* __restrict__ out_norms
+) {
+    const size_t sample_idx = blockIdx.x / blocks_per_sample;
+    if (sample_idx >= num_samples) return;
+
+    const size_t block_in_sample = blockIdx.x % blocks_per_sample;
+    const size_t base = sample_idx * sample_len;
+
+    const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
+    const size_t stride = blockDim.x * blocks_per_sample;
+
+    float local_sum = 0.0f;
+
+    size_t vec_offset = vec_idx;
+    size_t offset = vec_offset * 2;
+    while (offset + 1 < sample_len) {
+        const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch + 
base) + vec_offset);
+        local_sum += v.x * v.x + v.y * v.y;

Review Comment:
   input_batch + base is a float*, and float2 loads require 8-byte alignment. 
If sample_len is odd, then base = sample_idx * sample_len is odd whenever 
sample_idx is odd, so the address is offset by 4 bytes and the 
reinterpret_cast<const float2*> load is misaligned. That may trigger a 
misaligned memory access error or force less efficient load paths on some GPUs.



##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -512,6 +677,66 @@ int launch_l2_norm_batch(
     return (int)cudaGetLastError();
 }
 
+/// Launch L2 norm reduction for a batch of vectors (float32).
+/// Writes inverse norms for each sample into `inv_norms_out_d`.
+int launch_l2_norm_batch_f32(
+    const float* input_batch_d,
+    size_t num_samples,
+    size_t sample_len,
+    float* inv_norms_out_d,
+    cudaStream_t stream
+) {
+    if (num_samples == 0 || sample_len == 0) {
+        return cudaErrorInvalidValue;
+    }
+
+    cudaError_t memset_status = cudaMemsetAsync(
+        inv_norms_out_d,
+        0,
+        num_samples * sizeof(float),
+        stream
+    );
+    if (memset_status != cudaSuccess) {
+        return memset_status;
+    }
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const size_t elements_per_block = blockSize * 2; // float2 per thread
+    size_t blocks_per_sample = (sample_len + elements_per_block - 1) / 
elements_per_block;
+    const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
+    if (blocks_per_sample == 0) blocks_per_sample = 1;
+    if (blocks_per_sample > max_blocks_per_sample) {
+        blocks_per_sample = max_blocks_per_sample;
+    }
+
+    size_t gridSize = num_samples * blocks_per_sample;
+    const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit 
for 1D launch
+    if (gridSize > max_grid) {
+        blocks_per_sample = max_grid / num_samples;
+        if (blocks_per_sample == 0) {
+            blocks_per_sample = 1;
+        }
+        gridSize = num_samples * blocks_per_sample;

Review Comment:
   If num_samples exceeds the 1D grid limit, blocks_per_sample becomes 1, but 
gridSize = num_samples still exceeds max_grid, leading to an invalid launch. I 
think we could add an explicit guard (return an error) or switch to a 2D grid / 
a loop over samples. WDYT?





Issue Time Tracking
-------------------

    Worklog Id:     (was: 1001690)
    Time Spent: 20m  (was: 10m)

> Start Phase doesn't properly work in RecommenderJob
> ---------------------------------------------------
>
>                 Key: MAHOUT-802
>                 URL: https://issues.apache.org/jira/browse/MAHOUT-802
>             Project: Mahout
>          Issue Type: Bug
>            Reporter: Grant Ingersoll
>            Assignee: Grant Ingersoll
>            Priority: Minor
>             Fix For: 0.6
>
>         Attachments: MAHOUT-802.patch, MAHOUT-802b.patch
>
>          Time Spent: 20m
>  Remaining Estimate: 0h
>
> I'm trying to run RecommenderJob and do --startPhase 2 since I have my prefs 
> already in the right format.  Unfortunately, when I do that, I get:
> {quote}
> java.lang.IllegalArgumentException: Number of columns was not correctly set!
>       at 
> com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
>       at 
> org.apache.mahout.math.hadoop.similarity.RowSimilarityJob$SimilarityReducer.setup(RowSimilarityJob.java:296)
>       at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:174)
>       at 
> org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:648)
>       at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:416)
>       at 
> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:256)
> {quote}
> This appears to be due to the fact that the numberOfUsers variable defaults 
> to 0 and is only set when phase 1 is run.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to