Issue 173398
Summary Clang++ HUGE misoptimisation if `constexpr` added to the function
Labels clang
Assignees
Reporter socketpair
```cpp
#include <bit>
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <ctime>
#include <iostream>
#include <stdexcept>
#include <system_error>
#include <utility>
#include <vector>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>

namespace
{

// One node record of the lookup structure that findLPM walks.
//
// findLPM overlays an array of these records directly on the raw input
// buffer, so the member order and sizes define the expected byte layout.
struct NodeType
{
    // Bitmap masked with the selected bit and all lower bits; the population
    // count of the result is added to leaves_base to pick a leaf.
    std::uint64_t leaves_bitmap;

    // Bit i set: the walk continues into a child node for chunk value i.
    std::uint64_t children_bitmap;

    // Base index, in uint32_t units from the start of the buffer, used when
    // reading the result value.
    std::uint64_t leaves_base;

    // Base index, in NodeType units from the start of the buffer, of this
    // node's children.
    std::uint64_t children_base;
};

// Converts a 32-bit value between big-endian (network) and host byte order.
//
// On a big-endian host the argument is returned unchanged; on every other
// host its bytes are reversed with std::byteswap.
uint32_t ntoh(uint32_t a)
{
    constexpr bool host_is_big_endian = (std::endian::native == std::endian::big);
    return host_is_big_endian ? a : std::byteswap(a);
}

// Optimisation barrier: returns its argument unchanged while hiding the
// value from the optimiser.
//
// The empty volatile asm statement takes the value as a read-write register
// operand ("+r"), so the compiler has to assume it may have been modified and
// cannot constant-fold or hoist computations that depend on the result.
auto black_box(uint32_t value)
{
    uint32_t opaque = value;
    asm volatile("" : "+r"(opaque));
    return opaque;
}
}

// Looks up `addr` in the node structure stored in `m_array` and returns the
// uint32_t value selected by the walk (presumably a longest-prefix-match
// result, going by the name -- the file format itself is not visible here).
//
// Parameters:
//   addr     32-bit address; it is passed through ntoh() before use.
//   m_array  raw buffer that is read both as an array of NodeType records
//            (the root is record 0) and as an array of uint32_t values.
//            NOTE(review): the casts assume the buffer is suitably aligned
//            for both types -- confirm against the caller.
//
// Returns the uint32_t found at index
//   leaves_base + popcount(leaves_bitmap & (bit | (bit - 1)))
// of the first node whose children_bitmap does not have the selected bit set.
//
// This exact declaration ([[gnu::noinline]] together with constexpr, at
// namespace scope) is the subject of the report this code is attached to, so
// the tokens below are kept as they are.
[[gnu::noinline]] constexpr uint32_t findLPM(uint32_t addr, const unsigned char* m_array)
{
    // Start at the first NodeType record of the buffer.
    const NodeType* node = reinterpret_cast<const NodeType*>(m_array);
    addr = ntoh(addr);
    // 32 address bits in total, consumed STRIDE (6) bits per level.
 constexpr auto m_total_addr_bits = sizeof(uint32_t) * 8;
    constexpr std::size_t STRIDE = 6;

    // Compiler-specific unrolling hints for the walk loop.
    // NOTE(review): unroll_count evaluates to 32 / 6 + 1 = 6, while the loop
    // bound below permits up to 7 iterations (depth 0, 6, ..., 36) -- confirm
    // which is intended.
    // NOTE(review): GCC documents an unroll value of 0 as blocking unrolling,
    // the opposite of the clang branch -- confirm this asymmetry is wanted.
#if defined(__clang__)
#pragma clang loop unroll_count(m_total_addr_bits / STRIDE + 1)
#elif defined(__GNUC__)
#pragma GCC unroll 0
#endif
    for (std::size_t depth = 0; depth < (m_total_addr_bits + STRIDE); depth += STRIDE) {
        // Take the next 6-bit chunk of the address (most significant chunk
        // first) and turn it into a one-hot 64-bit mask.
        // NOTE(review): the shift count is computed in std::size_t, so for
        // depth >= 30 the expression 32 - 6 - depth wraps around and the
        // shift of the 32-bit addr is undefined behaviour. This is only safe
        // if the data guarantees that a leaf is hit within the first five
        // levels -- confirm against the file format.
        const auto bit = uint64_t { 1 } << ((addr >> (m_total_addr_bits - STRIDE - depth)) & ((uint32_t { 1 } << STRIDE) - 1));
        if (node->children_bitmap & bit) {
            // Descend: the child's index is children_base plus the number of
            // set children bits strictly below the selected bit.
            node = &reinterpret_cast<const NodeType*>(m_array)[node->children_base + std::popcount(node->children_bitmap & (bit - 1))];
        } else {
            // No child for this chunk: the result's index is leaves_base plus
            // the number of set leaf bits at or below the selected bit.
            return reinterpret_cast<const uint32_t*>(m_array)[node->leaves_base + std::popcount(node->leaves_bitmap & (bit | (bit - 1)))];
        }
    }
    // The loop is expected to return from a leaf; falling out of it is
    // declared impossible (and is undefined behaviour if it happens).
    std::unreachable();
}

// Benchmark driver for findLPM.
//
// Maps "compressed.bin" (looked up in the current working directory)
// read-only, calls findLPM 500,000,000 times on one fixed address and prints
// the throughput in millions of lookups per second followed by the last
// lookup result.
//
// Throws std::system_error carrying errno when open, fstat, mmap or
// clock_gettime fails. Nothing catches it, so the process terminates.
int main()
{
    const int fd_ = ::open("compressed.bin", O_RDONLY);
    if (fd_ == -1)
        throw std::system_error(errno, std::system_category(), "open failed");
    struct stat st {};
    if (::fstat(fd_, &st) == -1)
        throw std::system_error(errno, std::system_category(), "fstat failed");

    // Map the whole file. Report a failure with errno, like the other system
    // calls here, so the actual cause (e.g. an empty file) is visible.
    void* const mapping = ::mmap(nullptr, static_cast<std::size_t>(st.st_size), PROT_READ, MAP_PRIVATE, fd_, 0);
    if (mapping == MAP_FAILED)
        throw std::system_error(errno, std::system_category(), "mmap failed");
    const auto* data_ = static_cast<const unsigned char*>(mapping);
    // The mapping stays valid after the descriptor is closed.
    ::close(fd_);

    constexpr uint32_t ip = 925653069; // conv("77.88.44.55");
    constexpr std::size_t iterations = 500000000ull;
    struct timespec ts1 {};
    struct timespec ts2 {};

    if (clock_gettime(CLOCK_MONOTONIC, &ts1) == -1)
        throw std::system_error(errno, std::system_category(), "clock_gettime()");

    // black_box hides the constant address from the optimiser so that every
    // iteration performs a real call with an opaque argument.
    uint32_t out = 0;
    for (std::size_t i = 0; i < iterations; i++) {
        out = findLPM(black_box(ip), data_);
    }
    if (clock_gettime(CLOCK_MONOTONIC, &ts2) == -1)
        throw std::system_error(errno, std::system_category(), "clock_gettime()");

    // Elapsed nanoseconds; iterations * 1000 / ns yields millions per second.
    auto diff_nsec = (ts2.tv_sec * 1000000000ull + ts2.tv_nsec) - (ts1.tv_sec * 1000000000ull + ts1.tv_nsec);
    std::cout << "Speed (Mlp/sec): " << iterations * 1000ull / diff_nsec << std::endl;
    std::cout << "Just to tell compiler the result is used: " << out << std::endl;
}
```

> `[[gnu::noinline]] constexpr uint32_t findLPM(uint32_t addr, const unsigned char* m_array)`

Removing `constexpr` here *or* placing the function into an anonymous namespace fixes the problem.


How to trigger it:

```
#!/bin/bash

# Stop at the first failing command (-e), treat unset variables as errors (-u)
# and echo every command before it runs (-x).
set -e -u -x

# Build the same source once with each compiler, using identical flags.
# --save-temps keeps the intermediate files of the compilation.
g++     bug.cpp  -std=c++23 -march=native -Wall -Wextra -O3 -o as_gcc --save-temps
clang++ bug.cpp -std=c++23 -march=native -Wall -Wextra -O3 -o as_clang --save-temps

# Run both binaries; each one opens compressed.bin from the current directory.
./as_gcc
./as_clang
```

gives:

```
+ g++ bug.cpp -std=c++23 -march=native -Wall -Wextra -O3 -o as_gcc --save-temps
+ clang++ bug.cpp -std=c++23 -march=native -Wall -Wextra -O3 -o as_clang --save-temps
+ ./as_gcc
Speed (Mlp/sec): 7923
Just to tell compiler the result is used: 42
+ ./as_clang
Speed (Mlp/sec): 166
Just to tell compiler the result is used: 42
```

That is a significant performance drop! But if I change the source as explained above (remove `constexpr` or move the function into an anonymous namespace), I get:

```
+ g++ bug.cpp -std=c++23 -march=native -Wall -Wextra -O3 -o as_gcc --save-temps
+ clang++ bug.cpp -std=c++23 -march=native -Wall -Wextra -O3 -o as_clang --save-temps
+ ./as_gcc
Speed (Mlp/sec): 7837
Just to tell compiler the result is used: 42
+ ./as_clang
Speed (Mlp/sec): 3968
Just to tell compiler the result is used: 42
```

```
$ LANG=C g++ --version
g++ (GCC) 15.2.1 20251211 (Red Hat 15.2.1-5)

$ LANG=C clang++ --version
clang version 21.1.7 (Fedora 21.1.7-1.fc43)
Target: x86_64-redhat-linux-gnu
```

In order to run it you need `compressed.bin`. I am attaching it here in a .zip file.

[a.zip](https://github.com/user-attachments/files/24316011/a.zip)
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to