[
https://issues.apache.org/jira/browse/MAHOUT-802?focusedWorklogId=1001690&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-1001690
]
ASF GitHub Bot logged work on MAHOUT-802:
-----------------------------------------
Author: ASF GitHub Bot
Created on: 24/Jan/26 15:04
Start Date: 24/Jan/26 15:04
Worklog Time Spent: 10m
Work Description: rich7420 commented on code in PR #918:
URL: https://github.com/apache/mahout/pull/918#discussion_r2724245973
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -391,6 +455,46 @@ __global__ void l2_norm_batch_kernel(
}
}
+/// Kernel: accumulate L2 norms for a batch (float32).
+/// Grid is organized as (blocks_per_sample * num_samples) blocks.
+__global__ void l2_norm_batch_kernel_f32(
+ const float* __restrict__ input_batch,
+ size_t num_samples,
+ size_t sample_len,
+ size_t blocks_per_sample,
+ float* __restrict__ out_norms
+) {
+ const size_t sample_idx = blockIdx.x / blocks_per_sample;
+ if (sample_idx >= num_samples) return;
+
+ const size_t block_in_sample = blockIdx.x % blocks_per_sample;
+ const size_t base = sample_idx * sample_len;
+
+ const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
+ const size_t stride = blockDim.x * blocks_per_sample;
+
+ float local_sum = 0.0f;
+
+ size_t vec_offset = vec_idx;
+ size_t offset = vec_offset * 2;
+ while (offset + 1 < sample_len) {
+ const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch +
base) + vec_offset);
+ local_sum += v.x * v.x + v.y * v.y;
Review Comment:
input_batch + base is a float*, and float2 loads require 8‑byte alignment.
If sample_len is odd, then base = sample_idx * sample_len is odd, so the
address is offset by 4 bytes and the reinterpret_cast<const float2*> load is
misaligned. That maybe trigger misaligned memory access or force less efficient
load paths on some GPUs.
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -512,6 +677,66 @@ int launch_l2_norm_batch(
return (int)cudaGetLastError();
}
+/// Launch L2 norm reduction for a batch of vectors (float32).
+/// Writes inverse norms for each sample into `inv_norms_out_d`.
+int launch_l2_norm_batch_f32(
+ const float* input_batch_d,
+ size_t num_samples,
+ size_t sample_len,
+ float* inv_norms_out_d,
+ cudaStream_t stream
+) {
+ if (num_samples == 0 || sample_len == 0) {
+ return cudaErrorInvalidValue;
+ }
+
+ cudaError_t memset_status = cudaMemsetAsync(
+ inv_norms_out_d,
+ 0,
+ num_samples * sizeof(float),
+ stream
+ );
+ if (memset_status != cudaSuccess) {
+ return memset_status;
+ }
+
+ const int blockSize = DEFAULT_BLOCK_SIZE;
+ const size_t elements_per_block = blockSize * 2; // float2 per thread
+ size_t blocks_per_sample = (sample_len + elements_per_block - 1) /
elements_per_block;
+ const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
+ if (blocks_per_sample == 0) blocks_per_sample = 1;
+ if (blocks_per_sample > max_blocks_per_sample) {
+ blocks_per_sample = max_blocks_per_sample;
+ }
+
+ size_t gridSize = num_samples * blocks_per_sample;
+ const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit
for 1D launch
+ if (gridSize > max_grid) {
+ blocks_per_sample = max_grid / num_samples;
+ if (blocks_per_sample == 0) {
+ blocks_per_sample = 1;
+ }
+ gridSize = num_samples * blocks_per_sample;
Review Comment:
If num_samples exceeds the 1D grid limit, blocks_per_sample becomes 1 but
gridSize = num_samples still exceeds max_grid, leading to an invalid launch. I
think we could add an explicit guard (return an error) or switch to a 2D grid /
loop over samples. WDYT?
Issue Time Tracking
-------------------
Worklog Id: (was: 1001690)
Time Spent: 20m (was: 10m)
> Start Phase doesn't properly work in RecommenderJob
> ---------------------------------------------------
>
> Key: MAHOUT-802
> URL: https://issues.apache.org/jira/browse/MAHOUT-802
> Project: Mahout
> Issue Type: Bug
> Reporter: Grant Ingersoll
> Assignee: Grant Ingersoll
> Priority: Minor
> Fix For: 0.6
>
> Attachments: MAHOUT-802.patch, MAHOUT-802b.patch
>
> Time Spent: 20m
> Remaining Estimate: 0h
>
> I'm trying to run RecommenderJob and do --startPhase 2 since I have my prefs
> already in the right format. Unfortunately, when I do that, I get:
> {quote}
> java.lang.IllegalArgumentException: Number of columns was not correctly set!
> at
> com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
> at
> org.apache.mahout.math.hadoop.similarity.RowSimilarityJob$SimilarityReducer.setup(RowSimilarityJob.java:296)
> at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:174)
> at
> org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:648)
> at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:416)
> at
> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:256)
> {quote}
> This appears to be due to the fact that the numberOfUsers variable defaults
> to 0 and is only set when phase 1 is run.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)