Hello Guix,

I recently discovered that the FFTW library can do runtime CPU
detection.  In order to do this, the package needs to be configured to
build SIMD "codelets", as our 'fftw-avx' package currently does.  Then,
based on the instruction support detected at runtime, the library makes
those kernels available to the FFTW "planner" for execution.

I tested this on two systems: 1) a system with SSE2, and 2) a system
with AVX2.  I configured the library with "--enable-sse2 --enable-avx
--enable-avx2", then ran the following on both systems:

1)
$ ./tests/bench --verbose=3 --verify 'ibcd11x7x6v10'
Planning ibcd11x7x6v10...
using plan_many_dft
estimate-planner time: 0.004355 s
using plan_many_dft
planner time: 0.035684 s
(dft-rank>=2/1
  (dft-vrank>=1-x11/1
    (dft-rank>=2/1
      (dft-vrank>=1-x7/1
        (dft-direct-6-x10 "n1bv_6_sse2"))
      (dft-direct-7-x60 "n1bv_7_sse2")))
  (dft-direct-11-x420 "n1bv_11_sse2"))
flops: 36800 add, 9700 mul, 26260 fma
estimated cost: 99057.699080, pcost = 115706.000000
ibcd11x7x6v10 4.33362e-16 7.27264e-16 8.46842e-16

2)
$ ./tests/bench --verbose=3 --verify 'ibcd11x7x6v10'
Planning ibcd11x7x6v10...
using plan_many_dft
estimate-planner time: 0.001485 s
using plan_many_dft
planner time: 0.025788 s
(dft-rank>=2/1
  (dft-rank>=2/1
    (dft-vrank>=1-x77/1
      (dft-direct-6-x10 "n1bv_6_sse2"))
    (dft-vrank>=1-x11/1
      (dft-direct-7-x60 "n1bv_7_avx")))
  (dft-direct-11-x420 "n1bv_11_avx"))
flops: 12280 add, 2810 mul, 6950 fma
estimated cost: 28996.283180, pcost = 40767.000000
ibcd11x7x6v10 2.24601e-07 3.90447e-07 2.42548e-07


The attached patch is a WIP.

-- 
Eric Bavier, Scientific Libraries, Cray Inc.
diff --git a/gnu/packages/algebra.scm b/gnu/packages/algebra.scm
index 2aa1777db..96c78ea81 100644
--- a/gnu/packages/algebra.scm
+++ b/gnu/packages/algebra.scm
@@ -533,17 +533,26 @@ a C program.")
     (build-system gnu-build-system)
     (arguments
      '(#:configure-flags
-       '("--enable-shared" "--enable-openmp" "--enable-threads")
-       #:phases (alist-cons-before
-                 'build 'no-native
-                 (lambda _
-                   ;; By default '-mtune=native' is used.  However, that may
-                   ;; cause the use of ISA extensions (SSE2, etc.) that are
-                   ;; not necessarily available on the user's machine when
-                   ;; that package is built on a different machine.
-                   (substitute* (find-files "." "Makefile$")
-                     (("-mtune=native") "")))
-                 %standard-phases)))
+       `("--enable-shared" "--enable-openmp" "--enable-threads"
+         ,@(let ((system (or (%current-target-system) (%current-system))))
+             (cond
+              ((or (string-prefix? "x86_64" system)
+                   (string-prefix? "i686" system))
+               ;; Enable AVX & co. for codelets.  See details at:
+               ;; <http://fftw.org/fftw3_doc/Installation-on-Unix.html>.
+               '("--enable-avx" "--enable-avx2"
+                 "--enable-avx512" "--enable-avx-128-fma"))
+              ((string-prefix? "aarch64" system)
+               '("--enable-neon" "--enable-armv8-cntvct-el0"))
+              ((string-prefix? "armv7" system)
+               '("--enable-neon" "--enable-armv7a-cntvct"))
+              ((string-prefix? "mips" system)
+               '("--enable-mips-zbus-timer"))))
+         ;; By default '-mtune=native' is used.  However, that may cause the
+         ;; use of ISA extensions (e.g. AVX) that are not necessarily
+         ;; available on the user's machine when that package is built on a
+         ;; different machine.
+         "ax_cv_c_flags__mtune_native=no")))
     (native-inputs `(("perl" ,perl)))
     (home-page "http://fftw.org")
     (synopsis "Computing the discrete Fourier transform")
@@ -560,7 +569,7 @@ cosine/ sine transforms or DCT/DST).")
     (arguments
      (substitute-keyword-arguments (package-arguments fftw)
        ((#:configure-flags cf)
-        `(cons "--enable-float" ,cf))))
+        `(cons  "--enable-float" "--enable-sse" ,cf))))
     (description
      (string-append (package-description fftw)
                     "  Single-precision version."))))
@@ -592,29 +601,6 @@ cosine/ sine transforms or DCT/DST).")
                (base32
                 "0wsms8narnbhfsa8chdflv2j9hzspvflblnqdn7hw8x5xdzrnq1v"))))))
 
-(define-public fftw-avx
-  (package
-    (inherit fftw-3.3.7)
-    (name "fftw-avx")
-    (arguments
-     (substitute-keyword-arguments (package-arguments fftw-3.3.7)
-       ((#:configure-flags flags ''())
-        ;; Enable AVX & co.  See details at:
-        ;; <http://fftw.org/fftw3_doc/Installation-on-Unix.html>.
-        `(append '("--enable-avx" "--enable-avx2" "--enable-avx512"
-                   "--enable-avx-128-fma")
-                 ,flags))
-       ((#:substitutable? _ #f)
-        ;; To run the tests, we must have a CPU that supports all these
-        ;; extensions.  Since we cannot be sure that machines in the build
-        ;; farm support them, disable substitutes altogether.
-        #f)
-       ((#:phases _)
-        ;; Since we're not providing binaries, let '-mtune=native' through.
-        '%standard-phases)))
-    (synopsis "Computing the discrete Fourier transform (AVX2-optimized)")
-    (supported-systems '("x86_64-linux"))))
-
 (define-public java-la4j
   (package
     (name "java-la4j")

Reply via email to