Stuart Henderson <s...@spacehopper.org> wrote: > On 2024/02/18 16:56, Evan Silberman wrote: > > Something like this? I'm out of my depth and heavily pattern-matching > > against the fix to simdutf and other references. Genuinely no idea if > > I'm using inline asm correctly, etc. Works on my machine, however. > > That seems right to me. I don't have an AVX512 machine handy though. > Here's a combined diff. (No idea what happened with the distinfo that's > in tree but let's normalise it while we're there). > > The simdutf update is merged (https://github.com/haskell/text/pull/564), > so we probably want to try a PR for your diff against > https://github.com/haskell/text/blob/master/cbits/measure_off.c > if everyone here is happy with it.
Hi Stuart, Thanks for putting the whole patch together. Having rebuilt the GHC package with a patched text library, I can build a working pandoc and my own code. So this all looks good to me. I'll take my diff to measure_off.c upstream when I get a moment. It won't actually trickle down to us until it gets bundled into a newer GHC and then we adopt it so we may carry the patch for a while. Evan > > Index: Makefile > =================================================================== > RCS file: /cvs/ports/lang/ghc/Makefile,v > retrieving revision 1.220 > diff -u -p -r1.220 Makefile > --- Makefile 5 Feb 2024 01:49:50 -0000 1.220 > +++ Makefile 21 Feb 2024 12:35:13 -0000 > @@ -14,6 +14,7 @@ USE_NOEXECONLY = Yes > USE_NOBTCFI = Yes > > GHC_VERSION = 9.6.4 > +REVISION = 0 > DISTNAME = ghc-${GHC_VERSION} > CATEGORIES = lang devel > HOMEPAGE = https://www.haskell.org/ghc/ > Index: distinfo > =================================================================== > RCS file: /cvs/ports/lang/ghc/distinfo,v > retrieving revision 1.73 > diff -u -p -r1.73 distinfo > --- distinfo 5 Feb 2024 01:49:28 -0000 1.73 > +++ distinfo 21 Feb 2024 12:35:13 -0000 > @@ -1,10 +1,10 @@ > -SHA256 (ghc/ghc-9.6.4.20240111-amd64.tar.xz) = > CedJ29vBFZyl1e+DgcUqPfjHMDRKmEOsXP9gH4Wka6E= > -SHA256 (ghc/ghc-9.6.4.20240111-shlibs-amd64.tar.gz) = > Nb3trqnIF8H5kfKEkeGLr+sl4rPeFsbW/gfkelRprrY= > SHA256 (ghc/ghc-9.6.4-src.tar.xz) = > EL8luLBxdP3ZhotcDFbBfA7x7ctiR7S4ZL6TNlG/1MA= > SHA256 (ghc/ghc-9.6.4-testsuite.tar.xz) = > bhMoL76//b+gpJiJQ3REyakM/ldgxHlpzUJFhUwzjXM= > +SHA256 (ghc/ghc-9.6.4.20240111-amd64.tar.xz) = > CedJ29vBFZyl1e+DgcUqPfjHMDRKmEOsXP9gH4Wka6E= > +SHA256 (ghc/ghc-9.6.4.20240111-shlibs-amd64.tar.gz) = > Nb3trqnIF8H5kfKEkeGLr+sl4rPeFsbW/gfkelRprrY= > SHA256 (ghc/hadrian-sources-9.6.4.20240111.tar.gz) = > wMMJfyP7Pr6xjb/tj9Kz5iZugGr6+duMwJ23aGsUWy0= > SIZE (ghc/ghc-9.6.4-src.tar.xz) = 29451856 > SIZE (ghc/ghc-9.6.4-testsuite.tar.xz) = 7075820 > SIZE (ghc/ghc-9.6.4.20240111-amd64.tar.xz) = 
74706384 > SIZE (ghc/ghc-9.6.4.20240111-shlibs-amd64.tar.gz) = 3544885 > -SIZE (ghc/hadrian-sources-9.6.4.20240111.tar.gz) = 2125322 > \ No newline at end of file > +SIZE (ghc/hadrian-sources-9.6.4.20240111.tar.gz) = 2125322 > Index: patches/patch-libraries_text_cbits_measure_off_c > =================================================================== > RCS file: patches/patch-libraries_text_cbits_measure_off_c > diff -N patches/patch-libraries_text_cbits_measure_off_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-libraries_text_cbits_measure_off_c 21 Feb 2024 12:35:13 > -0000 > @@ -0,0 +1,23 @@ > +Don't attempt to use avx512 kernels when the OS doesn't support them > + > +Index: libraries/text/cbits/measure_off.c > +--- libraries/text/cbits/measure_off.c.orig > ++++ libraries/text/cbits/measure_off.c > +@@ -44,12 +44,16 @@ > + bool has_avx512_vl_bw() { > + #if (__GNUC__ >= 7 || __GNUC__ == 6 && __GNUC_MINOR__ >= 3) || > defined(__clang_major__) > + uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; > ++ uint64_t xcr0; > + __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); > + // https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features > ++ // __asm__("xgetbv\n\t" : "=a" (xcr0) : "c" (0)); > + const bool has_avx512_bw = ebx & (1 << 30); > + const bool has_avx512_vl = ebx & (1 << 31); > ++ // XCR0 bits 5, 6, and 7 > ++ const bool avx512_os_enabled = (xcr0 & 0xE0) == 0xE0; > + // printf("cpuid=%d=cpuid\n", has_avx512_bw && has_avx512_vl); > +- return has_avx512_bw && has_avx512_vl; > ++ return has_avx512_bw && has_avx512_vl && avx512_os_enabled; > + #else > + return false; > + #endif > Index: patches/patch-libraries_text_simdutf_simdutf_h > =================================================================== > RCS file: patches/patch-libraries_text_simdutf_simdutf_h > diff -N patches/patch-libraries_text_simdutf_simdutf_h > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-libraries_text_simdutf_simdutf_h 21 Feb 2024 12:35:13 > -0000 > @@ 
-0,0 +1,78 @@ > +https://github.com/simdutf/simdutf/commit/55b107f609f5f63880db650a92861ae84cb10abe > +(haskell/text upstream has now updated to a version past this commit) > + > +Index: libraries/text/simdutf/simdutf.h > +--- libraries/text/simdutf/simdutf.h.orig > ++++ libraries/text/simdutf/simdutf.h > +@@ -549,6 +549,7 @@ namespace cpuid_bit { > + // EAX = 0x01 > + constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit 1 > of ECX for EAX=0x1 > + constexpr uint32_t sse42 = uint32_t(1) << 20; ///< @private bit 20 > of ECX for EAX=0x1 > ++ constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); > ///< @private bits 26+27 of ECX for EAX=0x1 > + > + // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf) > + // See: "Table 3-8. Information Returned by CPUID Instruction" > +@@ -574,6 +575,10 @@ namespace cpuid_bit { > + namespace edx { > + constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8; > + } > ++ namespace xcr0_bit { > ++ constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit > 2 = AVX > ++ constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private > bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM > ++ } > + } > + } > + > +@@ -583,7 +588,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, > + uint32_t *edx) { > + #if defined(_MSC_VER) > + int cpu_info[4]; > +- __cpuid(cpu_info, *eax); > ++ __cpuidex(cpu_info, *eax, *ecx); > + *eax = cpu_info[0]; > + *ebx = cpu_info[1]; > + *ecx = cpu_info[2]; > +@@ -601,6 +606,16 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, > + #endif > + } > + > ++static inline uint64_t xgetbv() { > ++#if defined(_MSC_VER) > ++ return _xgetbv(0); > ++#else > ++ uint32_t xcr0_lo, xcr0_hi; > ++ asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); > ++ return xcr0_lo | ((uint64_t)xcr0_hi << 32); > ++#endif > ++} > ++ > + static inline uint32_t detect_supported_architectures() { > + uint32_t eax; > + uint32_t ebx = 0; > +@@ -620,6 +635,16 @@ 
static inline uint32_t detect_supported_architectures( > + host_isa |= instruction_set::PCLMULQDQ; > + } > + > ++ if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) { > ++ return host_isa; > ++ } > ++ > ++ // xgetbv for checking if the OS saves registers > ++ uint64_t xcr0 = xgetbv(); > ++ > ++ if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) { > ++ return host_isa; > ++ } > + // ECX for EAX=0x7 > + eax = 0x7; > + ecx = 0x0; // Sub-leaf = 0 > +@@ -632,6 +657,9 @@ static inline uint32_t detect_supported_architectures( > + } > + if (ebx & cpuid_bit::ebx::bmi2) { > + host_isa |= instruction_set::BMI2; > ++ } > ++ if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == > cpuid_bit::xcr0_bit::avx512_saved)) { > ++ return host_isa; > + } > + if (ebx & cpuid_bit::ebx::avx512f) { > + host_isa |= instruction_set::AVX512F;