https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811
--- Comment #5 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
I also forgot to mention that I used a zen3 machine, so Raptor Lake is not necessary.
Note that the build system appends -O2 after any CFLAGS specified, so it really
is an -O2 build:
# Force build with optimizations in release mode.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
For Clang, other options are appended:
-fnew-alignment=8
-fno-cxx-exceptions
-fno-slp-vectorize
-fno-vectorize
-disable-free
-disable-llvm-verifier
The perf profile mixing both the GCC and Clang builds is:
8.36% cjxl libjxl.so.0.7.0 [.] jxl::(anonymous
namespace)::FindTextLikePatches
◆
5.74% cjxl libjxl.so.0.7.0 [.] jxl::FindBestPatchDictionary
▒
4.51% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy
▒
4.50% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels
▒
4.25% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC
▒
4.10% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy
▒
3.77% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels
▒
3.46% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC
▒
3.08% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier
▒
3.04% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier
▒
2.98% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<8ul, 8ul>::operator()
▒
2.80% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::{l▒
2.75% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::$_▒
2.26% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::SRGBToXYB(jxl::Image3<float> const&, float const*,
jxl::ThreadPool*, jxl::Image3<float>*)::$_0>::CallDataFunc
▒
2.00% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<4ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>
▒
1.95% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<16ul, 8ul>::operator()
▒
1.68% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
1.68% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()
▒
1.66% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
1.56% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<8ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>
▒
1.52% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()
▒
1.33% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&,
jxl::FrameDimensions const&, float, jxl::ThreadPool*, jxl::Plane<float>*)::$_0,
jxl::N_AVX2▒
1.27% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<64ul, 0ul, jxl::N_AVX2::(anonymous
namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo>
▒
1.11% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous
namespace)::FindTextLikePatches(jxl::Image3<float> const&,
jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*,
bool)::{lambda(un▒
So it is some hand-written AVX code. In the GCC build the top function is
FindTextLikePatches, while in the Clang build it is FindBestPatchDictionary.
We do not inline it because of the large function growth limit. Adding
--param large-function-insns=1000000 makes the inlining decisions match and
has no effect on performance.
With these changes I get:
8.42% cjxl libjxl.so.0.7.0 [.] jxl::FindBestPatchDictionary
◆
5.72% cjxl libjxl.so.0.7.0 [.] jxl::FindBestPatchDictionary
▒
4.50% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels
▒
4.46% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy
▒
4.25% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC
▒
4.14% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::EstimateEntropy
▒
3.76% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::TransformFromPixels
▒
3.56% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::QuantizeBlockAC
▒
3.10% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier
▒
3.00% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::FindBestMultiplier
▒
2.98% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<8ul, 8ul>::operator()
▒
2.82% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::{l▒
2.75% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*,
jxl::Plane<float>*)::$_▒
2.26% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::SRGBToXYB(jxl::Image3<float> const&, float const*,
jxl::ThreadPool*, jxl::Image3<float>*)::$_0>::CallDataFunc
▒
1.99% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<4ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>
▒
1.95% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<16ul, 8ul>::operator()
▒
1.69% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
1.67% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternal(jxl::Span<unsigned char const>, unsigned long,
unsigned long, jxl::ColorEncoding const&, unsigned long, bool, unsigned long,
JxlEnd▒
1.66% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()
▒
1.54% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<8ul, 4ul, jxl::N_AVX2::(anonymous namespace)::DCTFrom,
jxl::N_AVX2::(anonymous namespace)::DCTTo>
▒
1.49% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DImpl<32ul, 8ul>::operator()
▒
1.34% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&,
jxl::FrameDimensions const&, float, jxl::ThreadPool*, jxl::Plane<float>*)::$_0,
jxl::N_AVX2▒
1.27% cjxl libjxl.so.0.7.0 [.] jxl::N_AVX2::(anonymous
namespace)::DCT1DWrapper<64ul, 0ul, jxl::N_AVX2::(anonymous
namespace)::DCTFrom, jxl::N_AVX2::(anonymous namespace)::DCTTo>
▒
1.16% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous
namespace)::FindTextLikePatches(jxl::Image3<float> const&,
jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*,
bool)::{lambda(un▒
1.07% cjxl libjxl.so.0.7.0 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long), jxl::(anonymous
namespace)::FindTextLikePatches(jxl::Image3<float> const&,
jxl::PassesEncoderState const*, jxl::ThreadPool*, jxl::AuxOut*,
bool)::$_0>::Call▒