Stuart Henderson <s...@spacehopper.org> wrote:
> On 2024/02/18 16:56, Evan Silberman wrote:
> > Something like this? I'm out of my depth and heavily pattern-matching
> > against the fix to simdutf and other references. Genuinely no idea if
> > I'm using inline asm correctly, etc. Works on my machine, however.
> 
> That seems right to me. I don't have an AVX512 machine handy though.
> Here's a combined diff. (No idea what happened with the distinfo that's
> in tree but let's normalise it while there).
> 
> The simdutf update is merged (https://github.com/haskell/text/pull/564),
> so we probably want to try a PR for your diff against
> https://github.com/haskell/text/blob/master/cbits/measure_off.c
> if everyone here is happy with it.

Hi Stuart,

Thanks for putting the whole patch together. Having rebuilt the GHC
package with a patched text library, I can build a working pandoc and my
own code. So this all looks good to me.

I'll take my diff to measure_off.c upstream when I get a moment. It
won't actually trickle down to us until it gets bundled into a newer GHC
and then we adopt it, so we may carry the patch for a while.

Evan


> 
> Index: Makefile
> ===================================================================
> RCS file: /cvs/ports/lang/ghc/Makefile,v
> retrieving revision 1.220
> diff -u -p -r1.220 Makefile
> --- Makefile  5 Feb 2024 01:49:50 -0000       1.220
> +++ Makefile  21 Feb 2024 12:35:13 -0000
> @@ -14,6 +14,7 @@ USE_NOEXECONLY =    Yes
>  USE_NOBTCFI =                Yes
>  
>  GHC_VERSION =                9.6.4
> +REVISION =           0
>  DISTNAME =           ghc-${GHC_VERSION}
>  CATEGORIES =         lang devel
>  HOMEPAGE =           https://www.haskell.org/ghc/
> Index: distinfo
> ===================================================================
> RCS file: /cvs/ports/lang/ghc/distinfo,v
> retrieving revision 1.73
> diff -u -p -r1.73 distinfo
> --- distinfo  5 Feb 2024 01:49:28 -0000       1.73
> +++ distinfo  21 Feb 2024 12:35:13 -0000
> @@ -1,10 +1,10 @@
> -SHA256 (ghc/ghc-9.6.4.20240111-amd64.tar.xz) = 
> CedJ29vBFZyl1e+DgcUqPfjHMDRKmEOsXP9gH4Wka6E=
> -SHA256 (ghc/ghc-9.6.4.20240111-shlibs-amd64.tar.gz) = 
> Nb3trqnIF8H5kfKEkeGLr+sl4rPeFsbW/gfkelRprrY=
>  SHA256 (ghc/ghc-9.6.4-src.tar.xz) = 
> EL8luLBxdP3ZhotcDFbBfA7x7ctiR7S4ZL6TNlG/1MA=
>  SHA256 (ghc/ghc-9.6.4-testsuite.tar.xz) = 
> bhMoL76//b+gpJiJQ3REyakM/ldgxHlpzUJFhUwzjXM=
> +SHA256 (ghc/ghc-9.6.4.20240111-amd64.tar.xz) = 
> CedJ29vBFZyl1e+DgcUqPfjHMDRKmEOsXP9gH4Wka6E=
> +SHA256 (ghc/ghc-9.6.4.20240111-shlibs-amd64.tar.gz) = 
> Nb3trqnIF8H5kfKEkeGLr+sl4rPeFsbW/gfkelRprrY=
>  SHA256 (ghc/hadrian-sources-9.6.4.20240111.tar.gz) = 
> wMMJfyP7Pr6xjb/tj9Kz5iZugGr6+duMwJ23aGsUWy0=
>  SIZE (ghc/ghc-9.6.4-src.tar.xz) = 29451856
>  SIZE (ghc/ghc-9.6.4-testsuite.tar.xz) = 7075820
>  SIZE (ghc/ghc-9.6.4.20240111-amd64.tar.xz) = 74706384
>  SIZE (ghc/ghc-9.6.4.20240111-shlibs-amd64.tar.gz) = 3544885
> -SIZE (ghc/hadrian-sources-9.6.4.20240111.tar.gz) = 2125322
> \ No newline at end of file
> +SIZE (ghc/hadrian-sources-9.6.4.20240111.tar.gz) = 2125322
> Index: patches/patch-libraries_text_cbits_measure_off_c
> ===================================================================
> RCS file: patches/patch-libraries_text_cbits_measure_off_c
> diff -N patches/patch-libraries_text_cbits_measure_off_c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ patches/patch-libraries_text_cbits_measure_off_c  21 Feb 2024 12:35:13 
> -0000
> @@ -0,0 +1,23 @@
> +Don't attempt to use avx512 kernels when the OS doesn't support them
> +
> +Index: libraries/text/cbits/measure_off.c
> +--- libraries/text/cbits/measure_off.c.orig
> ++++ libraries/text/cbits/measure_off.c
> +@@ -44,12 +44,16 @@
> + bool has_avx512_vl_bw() {
> + #if (__GNUC__ >= 7 || __GNUC__ == 6 && __GNUC_MINOR__ >= 3) || 
> defined(__clang_major__)
> +   uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
> ++  uint64_t xcr0;
> +   __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
> +   // https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features
> ++  __asm__("xgetbv\n\t" : "=a" (xcr0) : "c" (0));
> +   const bool has_avx512_bw = ebx & (1 << 30);
> +   const bool has_avx512_vl = ebx & (1 << 31);
> ++  // XCR0 bits 5, 6, and 7
> ++  const bool avx512_os_enabled = (xcr0 & 0xE0) == 0xE0;
> +   // printf("cpuid=%d=cpuid\n", has_avx512_bw && has_avx512_vl);
> +-  return has_avx512_bw && has_avx512_vl;
> ++  return has_avx512_bw && has_avx512_vl && avx512_os_enabled;
> + #else
> +   return false;
> + #endif
> Index: patches/patch-libraries_text_simdutf_simdutf_h
> ===================================================================
> RCS file: patches/patch-libraries_text_simdutf_simdutf_h
> diff -N patches/patch-libraries_text_simdutf_simdutf_h
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ patches/patch-libraries_text_simdutf_simdutf_h    21 Feb 2024 12:35:13 
> -0000
> @@ -0,0 +1,78 @@
> +https://github.com/simdutf/simdutf/commit/55b107f609f5f63880db650a92861ae84cb10abe
> +(haskell/text upstream has now updated to a version past this commit)
> +
> +Index: libraries/text/simdutf/simdutf.h
> +--- libraries/text/simdutf/simdutf.h.orig
> ++++ libraries/text/simdutf/simdutf.h
> +@@ -549,6 +549,7 @@ namespace cpuid_bit {
> +     // EAX = 0x01
> +     constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit  1 
> of ECX for EAX=0x1
> +     constexpr uint32_t sse42 = uint32_t(1) << 20;    ///< @private bit 20 
> of ECX for EAX=0x1
> ++    constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); 
> ///< @private bits 26+27 of ECX for EAX=0x1
> + 
> +     // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
> +     // See: "Table 3-8. Information Returned by CPUID Instruction"
> +@@ -574,6 +575,10 @@ namespace cpuid_bit {
> +     namespace edx {
> +       constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
> +     }
> ++    namespace xcr0_bit {
> ++      constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 
> 2 = AVX
> ++      constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private 
> bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
> ++    }
> +   }
> + }
> + 
> +@@ -583,7 +588,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx,
> +                          uint32_t *edx) {
> + #if defined(_MSC_VER)
> +   int cpu_info[4];
> +-  __cpuid(cpu_info, *eax);
> ++  __cpuidex(cpu_info, *eax, *ecx);
> +   *eax = cpu_info[0];
> +   *ebx = cpu_info[1];
> +   *ecx = cpu_info[2];
> +@@ -601,6 +606,16 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx,
> + #endif
> + }
> + 
> ++static inline uint64_t xgetbv() {
> ++#if defined(_MSC_VER)
> ++  return _xgetbv(0);
> ++#else
> ++  uint32_t xcr0_lo, xcr0_hi;
> ++  asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
> ++  return xcr0_lo | ((uint64_t)xcr0_hi << 32);
> ++#endif
> ++}
> ++
> + static inline uint32_t detect_supported_architectures() {
> +   uint32_t eax;
> +   uint32_t ebx = 0;
> +@@ -620,6 +635,16 @@ static inline uint32_t detect_supported_architectures(
> +     host_isa |= instruction_set::PCLMULQDQ;
> +   }
> + 
> ++  if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
> ++    return host_isa;
> ++  }
> ++
> ++  // xgetbv for checking if the OS saves registers
> ++  uint64_t xcr0 = xgetbv();
> ++
> ++  if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
> ++    return host_isa;
> ++  }
> +   // ECX for EAX=0x7
> +   eax = 0x7;
> +   ecx = 0x0; // Sub-leaf = 0
> +@@ -632,6 +657,9 @@ static inline uint32_t detect_supported_architectures(
> +   }
> +   if (ebx & cpuid_bit::ebx::bmi2) {
> +     host_isa |= instruction_set::BMI2;
> ++  }
> ++  if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == 
> cpuid_bit::xcr0_bit::avx512_saved)) {
> ++    return host_isa;
> +   }
> +   if (ebx & cpuid_bit::ebx::avx512f) {
> +     host_isa |= instruction_set::AVX512F;


Reply via email to