https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811
--- Comment #6 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
hottest loop in clang's profile is:
for (size_t y = 0; y < opsin.ysize(); y++) {
for (size_t x = 0; x < opsin.xsize(); x++) {
if (is_background_row[y * is_background_stride + x]) continue;
cc.clear();
stack.clear();
stack.emplace_back(x, y);
size_t min_x = x;
size_t max_x = x;
size_t min_y = y;
size_t max_y = y;
std::pair<uint32_t, uint32_t> reference;
bool found_border = false;
bool all_similar = true;
while (!stack.empty()) {
std::pair<uint32_t, uint32_t> cur = stack.back();
stack.pop_back();
if (visited_row[cur.second * visited_stride + cur.first]) continue;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
closed by this continue.
visited_row[cur.second * visited_stride + cur.first] = 1;
if (cur.first < min_x) min_x = cur.first;
if (cur.first > max_x) max_x = cur.first;
if (cur.second < min_y) min_y = cur.second;
if (cur.second > max_y) max_y = cur.second;
if (paint_ccs) {
cc.push_back(cur);
}
for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) {
for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) {
if (dx == 0 && dy == 0) continue;
int next_first = static_cast<int32_t>(cur.first) + dx;
int next_second = static_cast<int32_t>(cur.second) + dy;
if (next_first < 0 || next_second < 0 ||
static_cast<uint32_t>(next_first) >= opsin.xsize() ||
static_cast<uint32_t>(next_second) >= opsin.ysize()) {
continue;
}
std::pair<uint32_t, uint32_t> next{next_first, next_second};
if (!is_background_row[next.second * is_background_stride +
next.first]) {
stack.push_back(next);
} else {
if (!found_border) {
reference = next;
found_border = true;
} else {
if (!is_similar_b(next, reference)) all_similar = false;
}
}
}
}
}
if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize ||
max_y - min_y >= kMaxPatchSize) {
continue;
}
size_t bpos = background_stride * reference.second + reference.first;
float ref[3] = {background_rows[0][bpos], background_rows[1][bpos],
background_rows[2][bpos]};
bool has_similar = false;
for (size_t iy = std::max<int>(
static_cast<int32_t>(min_y) - kHasSimilarRadius, 0);
iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) {
for (size_t ix = std::max<int>(
static_cast<int32_t>(min_x) - kHasSimilarRadius, 0);
ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize());
ix++) {
size_t opos = opsin_stride * iy + ix;
float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos],
opsin_rows[2][opos]};
if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) {
has_similar = true;
}
}
}
if (!has_similar) continue;
info.emplace_back();
info.back().second.emplace_back(min_x, min_y);
QuantizedPatch& patch = info.back().first;
patch.xsize = max_x - min_x + 1;
patch.ysize = max_y - min_y + 1;
int max_value = 0;
for (size_t c : {1, 0, 2}) {
for (size_t iy = min_y; iy <= max_y; iy++) {
for (size_t ix = min_x; ix <= max_x; ix++) {
size_t offset = (iy - min_y) * patch.xsize + ix - min_x;
patch.fpixels[c][offset] =
opsin_rows[c][iy * opsin_stride + ix] - ref[c];
int val = pci.Quantize(patch.fpixels[c][offset], c);
patch.pixels[c][offset] = val;
if (std::abs(val) > max_value) max_value = std::abs(val);
}
}
}
if (max_value < kMinPeak) {
info.pop_back();
continue;
}
if (paint_ccs) {
float cc_color = rng.UniformF(0.5, 1.0);
for (std::pair<uint32_t, uint32_t> p : cc) {
ccs.Row(p.second)[p.first] = cc_color;
}
}
}
}
I guess such a large loop nest with hottest loop not being the innermost is bad
for register pressure.
Clangs code is :
0.02 │1196:┌─→cmp %r10,-0xb8(%rbp)
▒
│ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&,
jxl::PassesEncoderState*, JxlCmsInterface co▒
│ │while (!stack.empty()) {
◆
1.39 │ │↓ je 1690
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
│11a3:│ mov -0x8(%r10),%rbx
▒
9.80 │ │ mov -0x230(%rbp),%r14
▒
│ │__gnu_cxx::__normal_iterator<std::pair<unsigned int, unsigned
int>*, std::vector<std::pair<unsigned ▒
│ │{ return __normal_iterator(_M_current - __n); }
▒
0.86 │ │ add $0xfffffffffffffff8,%r10
▒
│ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&,
jxl::PassesEncoderState*, JxlCmsInterface co▒
0.36 │ │ mov %rbx,%r13
▒
0.01 │ │ shr $0x20,%r13
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
2.69 │ │ mov %ebx,%eax
▒
0.00 │ │ mov %r13,%rcx
▒
1.12 │ │ imul -0x180(%rbp),%rcx
▒
0.78 │ │ add %rax,%rcx
▒
2.73 │ ├──cmpb $0x0,(%r14,%rcx,1)
▒
19.91 │ └──jne 1196
▒
While GCC is longer:
Percent│ Percent│ while (!stack.empty()) {
▒
│1241:┌─→cmp %rax,-0x370(%rbp)
▒
0.70 │ │↓ je 1481
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
0.02 │124e:│ mov -0x8(%rax),%rdx
▒
│ │std::vector<std::pair<unsigned int, unsigned int>,
std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒
│ │--this->_M_impl._M_finish;
▒
0.63 │ │ sub $0x8,%rax
▒
0.74 │ │ mov %rax,-0x368(%rbp)
▒
│ │FindTextLikePatches():
▒
0.01 │ │ mov %edx,%esi
◆
0.35 │ │ mov %rdx,%rdi
▒
0.82 │ │ mov %rdx,-0x490(%rbp)
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
0.00 │ │ mov -0x5f8(%rbp),%rdx
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
1.32 │ │ shr $0x20,%rdi
▒
0.11 │ │ mov %rsi,%r15
▒
0.09 │ │ mov %edi,%ecx
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
0.72 │ │ mov -0x5f0(%rbp),%rdi
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
0.03 │ │ mov %rcx,%r12
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
0.39 │ │ imul %rcx,%rdx
▒
1.00 │ │ add %rsi,%rdx
▒
1.35 │ │ add %rdi,%rdx
▒
1.37 │ ├──cmpb $0x0,(%rdx)
▒
9.56 │ └──jne 1241
▒ while (!stack.empty()) {
▒
│1241:┌─→cmp %rax,-0x370(%rbp)
▒
0.70 │ │↓ je 1481
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
0.02 │124e:│ mov -0x8(%rax),%rdx
▒
│ │std::vector<std::pair<unsigned int, unsigned int>,
std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒
│ │--this->_M_impl._M_finish;
▒
0.63 │ │ sub $0x8,%rax
▒
0.74 │ │ mov %rax,-0x368(%rbp)
▒
│ │FindTextLikePatches():
▒
0.01 │ │ mov %edx,%esi
◆
0.35 │ │ mov %rdx,%rdi
▒
0.82 │ │ mov %rdx,-0x490(%rbp)
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
0.00 │ │ mov -0x5f8(%rbp),%rdx
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
1.32 │ │ shr $0x20,%rdi
▒
0.11 │ │ mov %rsi,%r15
▒
0.09 │ │ mov %edi,%ecx
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
0.72 │ │ mov -0x5f0(%rbp),%rdi
▒
│ │std::pair<uint32_t, uint32_t> cur = stack.back();
▒
0.03 │ │ mov %rcx,%r12
▒
│ │if (visited_row[cur.second * visited_stride + cur.first])
continue; ▒
0.39 │ │ imul %rcx,%rdx
▒
1.00 │ │ add %rsi,%rdx
▒
1.35 │ │ add %rdi,%rdx
▒
1.37 │ ├──cmpb $0x0,(%rdx)
▒
9.56 │ └──jne 1241
▒
Moreover we have heuristics saying that continue usually closes loops with low
trip count.
-fno-ivopts improves performance with --num_threads=0 however makes number of
instructions worse. Curiously enought with --num_threads=1 and more ivopts is
benefical.
