| Issue |
114891
|
| Summary |
Loop Vectorizer chooses small vectorization factor VF when known trip count isn't a multiple of it.
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
MatzeB
|
I am looking into vectorization for AVX512 in a case where the loop trip count isn't a multiple of the ideal VF of 16.
A simplified version of the problem looks like this[1]:
```
void foosum56(float *dest, const float *values) {
float buf[56];
memcpy(buf, dest, sizeof(buf));
// #pragma clang loop vectorize_width(16)
for (int i = 0; i < 56; i++) {
buf[i] += values[i];
}
memcpy(dest, buf, sizeof(buf));
}
```
Compiling with something like `clang -O3 -S -o - -mavx512f test.c`, the loop vectorizer chooses a vectorization factor of 8 and the resulting code uses ymm registers:
```
...
vmovups (%rsi), %ymm0
vmovups 32(%rsi), %ymm1
vmovups 64(%rsi), %ymm2
vmovups 96(%rsi), %ymm3
vaddps (%rdi), %ymm0, %ymm0
vaddps 32(%rdi), %ymm1, %ymm1
vaddps 64(%rdi), %ymm2, %ymm2
vaddps 96(%rdi), %ymm3, %ymm3
vmovups 128(%rsi), %ymm4
vaddps 128(%rdi), %ymm4, %ymm4
vmovups 160(%rsi), %ymm5
vaddps 160(%rdi), %ymm5, %ymm5
vmovups 192(%rsi), %ymm6
vaddps 192(%rdi), %ymm6, %ymm6
vmovups %ymm0, (%rdi)
vmovups %ymm1, 32(%rdi)
vmovups %ymm2, 64(%rdi)
vmovups %ymm3, 96(%rdi)
vmovups %ymm4, 128(%rdi)
vmovups %ymm5, 160(%rdi)
vmovups %ymm6, 192(%rdi)
...
```
- Ideally, though, we would use zmm registers/operations for the first 48 elements (3 × 16) and a ymm register/operation for the remaining 8 elements.
- Manually adding `#pragma clang loop vectorize_width(16)` produces a poor result too: while it does nicely use zmm for the first 48 elements, the remaining 8 elements are scalarized.
This still reproduces on LLVM trunk (on 8b55162e195783dd27e1c69fb4d97971ef76725b from Oct 29).
Filing this to document the issue while I am trying to figure out how this situation could be improved...
[1] Simplified llvm-ir:
```
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"
define void @foosum120(ptr %dest, ptr readonly %values) #0 {
entry:
%buf = alloca [56 x float], align 16
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(224) %buf, ptr noundef nonnull align 4 dereferenceable(224) %dest, i64 224, i1 false)
br label %for.body
for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, ptr %values, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds [56 x float], ptr %buf, i64 0, i64 %indvars.iv
%1 = load float, ptr %arrayidx2, align 4
%add = fadd float %0, %1
store float %add, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 56
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body ; , !llvm.loop !7
for.cond.cleanup:
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(224) %dest, ptr noundef nonnull align 16 dereferenceable(224) %buf, i64 224, i1 false)
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
attributes #2 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) }
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.width", i32 16}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs