On 21/04/2021 19:13, Assaf Gordon wrote:
Hello,

On 2021-03-29 7:21 a.m., Pádraig Brady wrote:

On 28/03/2021 18:29, Kristoffer Brånemyr via GNU coreutils General
I wanted to practice some more using vector intrinsics, so I made a
small AVX2 optimization for wc -l. Depending on line length it is
about 2-5x faster than previous version. (Well, only looking at user
time it is much faster than that even.)

Excellent results.
I'll review this very soon.


I'm attaching the patch (copied from the Github's pull-request),
hopefully we can continue the discussion here on the mailing list.

I plan to push the attached 2 commits tomorrow.
The first adjusts the original patch to pass `make syntax-check`.
Also I noticed an inconsistency in the new wc_lines() function,
between BUFFER_SIZE and BUFSIZ, and changed uses to the former.
The second commit, adds a --debug option to indicate
the now runtime variable behavior or which implementation is used.

cheers,
Pádraig
>From 07057dbcd61ab4cbda4bef110bb30c70f4d7f22f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristoffer=20Br=C3=A5nemyr?= <zti...@yahoo.se>
Date: Sat, 20 Feb 2021 12:27:17 +0100
Subject: [PATCH 1/2] wc: use avx2 optimization when counting only lines

Use cpuid to detect CPU support for avx2 instructions.
Performance was seen to improve by 5x for a file with only newlines,
while the performance for a file with no such characters is unchanged.

* configure.ac [USE_AVX2_WC_LINECOUNT]: A new conditional,
set when __get_cpuid_count() and avx2 compiler intrinsics are supported.
* src/wc.c (avx2_supported): A new function using __get_cpuid_count()
to determine if avx2 instructions are supported.
(wc_lines): A new function refactored from wc(),
which implements the standard line counting logic,
and provides the fallback implementation for when avx2 is not supported.
* src/wc_avx2.c: A new module to implement using avx2 intrinsics.
* src/local.mk: Reference the new module.  Note we build as a separate
lib so that it can be portably built with separate -mavx2 etc. flags.
---
 configure.ac  |  49 ++++++++++++++++
 src/local.mk  |   9 +++
 src/wc.c      | 157 ++++++++++++++++++++++++++++++++++++--------------
 src/wc_avx2.c | 122 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 294 insertions(+), 43 deletions(-)
 create mode 100644 src/wc_avx2.c

diff --git a/configure.ac b/configure.ac
index 02291a4ae..f0fbbd9b7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -575,6 +575,55 @@ AM_CONDITIONAL([USE_PCLMUL_CRC32],
                 test "x$pclmul_intrinsic_exists" = "xyes"])
 CFLAGS=$ac_save_CFLAGS
 
+AC_MSG_CHECKING([if __get_cpuid_count exists])
+AC_COMPILE_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <cpuid.h>
+
+    int main(void)
+    {
+      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+      __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+      return 1;
+    }
+  ]])
+  ],[
+    AC_MSG_RESULT([yes])
+    get_cpuid_count_exists=yes
+  ],[
+    AC_MSG_RESULT([no])
+  ])
+
+CFLAGS="-mavx2 $CFLAGS"
+AC_MSG_CHECKING([if avx2 intrinstics exists])
+AC_COMPILE_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int main(void)
+    {
+      __m256i a, b;
+      a = _mm256_sad_epu8(a, b);
+      return 1;
+    }
+  ]])
+  ],[
+    AC_MSG_RESULT([yes])
+    AC_DEFINE([HAVE_AVX2_INTRINSIC], [1], [avx2 intrinsics exists])
+    avx2_intrinsic_exists=yes
+  ],[
+    AC_MSG_RESULT([no])
+  ])
+if test "x$get_cpuid_count_exists" = "xyes" &&
+   test "x$avx2_intrinsic_exists" = "xyes"; then
+  AC_DEFINE([USE_AVX2_WC_LINECOUNT], [1], [Counting lines with AVX2 enabled])
+fi
+AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT],
+               [test "x$get_cpuid_count_exists" = "xyes" &&
+                test "x$avx2_intrinsic_exists" = "xyes"])
+
+CFLAGS=$ac_save_CFLAGS
+
 ############################################################################
 
 dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
diff --git a/src/local.mk b/src/local.mk
index 8c8479a53..c6555dafb 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -427,6 +427,15 @@ src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS)
 src_expand_SOURCES = src/expand.c src/expand-common.c
 src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
 
+src_wc_SOURCES = src/wc.c
+if USE_AVX2_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_avx2.a
+src_libwc_avx2_a_SOURCES = src/wc_avx2.c
+wc_avx2_ldadd = src/libwc_avx2.a
+src_wc_LDADD += $(wc_avx2_ldadd)
+src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS)
+endif
+
 # Ensure we don't link against libcoreutils.a as that lib is
 # not compiled with -fPIC which causes issues on 64 bit at least
 src_libstdbuf_so_LDADD = $(LIBINTL)
diff --git a/src/wc.c b/src/wc.c
index d635e5214..35a865719 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -37,6 +37,9 @@
 #include "safe-read.h"
 #include "stat-size.h"
 #include "xbinary-io.h"
+#ifdef USE_AVX2_WC_LINECOUNT
+# include <cpuid.h>
+#endif
 
 #if !defined iswspace && !HAVE_ISWSPACE
 # define iswspace(wc) \
@@ -53,6 +56,20 @@
 /* Size of atomic reads. */
 #define BUFFER_SIZE (16 * 1024)
 
+static bool
+wc_lines (char const *file, int fd, uintmax_t *lines_out,
+          uintmax_t *bytes_out);
+#ifdef USE_AVX2_WC_LINECOUNT
+/* From wc_avx2.c */
+extern bool
+wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
+               uintmax_t *bytes_out);
+#endif
+static bool
+(*wc_lines_p) (char const *file, int fd, uintmax_t *lines_out,
+                uintmax_t *bytes_out) = wc_lines;
+
+
 /* Cumulative number of lines, words, chars and bytes in all files so far.
    max_line_length is the maximum over all files processed so far.  */
 static uintmax_t total_lines;
@@ -108,6 +125,33 @@ static struct option const longopts[] =
   {NULL, 0, NULL, 0}
 };
 
+#ifdef USE_AVX2_WC_LINECOUNT
+static bool
+avx2_supported (void)
+{
+  unsigned int eax = 0;
+  unsigned int ebx = 0;
+  unsigned int ecx = 0;
+  unsigned int edx = 0;
+
+  if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx))
+    return false;
+
+  if (! (ecx & bit_OSXSAVE))
+    return false;
+
+  eax = ebx = ecx = edx = 0;
+
+  if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return false;
+
+  if (! (ebx & bit_AVX2))
+    return false;
+
+  return true;
+}
+#endif
+
 void
 usage (int status)
 {
@@ -208,6 +252,70 @@ write_counts (uintmax_t lines,
   putchar ('\n');
 }
 
+static bool
+wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
+{
+  size_t bytes_read;
+  uintmax_t lines, bytes;
+  char buf[BUFFER_SIZE + 1];
+  bool long_lines = false;
+
+  if (!lines_out || !bytes_out)
+    {
+      return false;
+    }
+
+  lines = bytes = 0;
+
+  while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
+    {
+
+      if (bytes_read == SAFE_READ_ERROR)
+        {
+          error (0, errno, "%s", quotef (file));
+          return false;
+        }
+
+      bytes += bytes_read;
+
+      char *p = buf;
+      char *end = buf + bytes_read;
+      uintmax_t plines = lines;
+
+      if (! long_lines)
+        {
+          /* Avoid function call overhead for shorter lines.  */
+          while (p != end)
+            lines += *p++ == '\n';
+        }
+      else
+        {
+          /* memchr is more efficient with longer lines.  */
+          while ((p = memchr (p, '\n', end - p)))
+            {
+              ++p;
+              ++lines;
+            }
+        }
+
+      /* If the average line length in the block is >= 15, then use
+          memchr for the next block, where system specific optimizations
+          may outweigh function call overhead.
+          FIXME: This line length was determined in 2015, on both
+          x86_64 and ppc64, but it's worth re-evaluating in future with
+          newer compilers, CPUs, or memchr() implementations etc.  */
+      if (lines - plines <= bytes_read / 15)
+        long_lines = true;
+      else
+        long_lines = false;
+    }
+
+  *bytes_out = bytes;
+  *lines_out = lines;
+
+  return true;
+}
+
 /* Count words.  FILE_X is the name of the file (or NULL for standard
    input) that is open on descriptor FD.  *FSTATUS is its status.
    CURRENT_POS is the current file offset if known, negative if unknown.
@@ -312,49 +420,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
     {
       /* Use a separate loop when counting only lines or lines and bytes --
          but not chars or words.  */
-      bool long_lines = false;
-      while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
-        {
-          if (bytes_read == SAFE_READ_ERROR)
-            {
-              error (0, errno, "%s", quotef (file));
-              ok = false;
-              break;
-            }
-
-          bytes += bytes_read;
-
-          char *p = buf;
-          char *end = p + bytes_read;
-          uintmax_t plines = lines;
-
-          if (! long_lines)
-            {
-              /* Avoid function call overhead for shorter lines.  */
-              while (p != end)
-                lines += *p++ == '\n';
-            }
-          else
-            {
-              /* memchr is more efficient with longer lines.  */
-              while ((p = memchr (p, '\n', end - p)))
-                {
-                  ++p;
-                  ++lines;
-                }
-            }
-
-          /* If the average line length in the block is >= 15, then use
-             memchr for the next block, where system specific optimizations
-             may outweigh function call overhead.
-             FIXME: This line length was determined in 2015, on both
-             x86_64 and ppc64, but it's worth re-evaluating in future with
-             newer compilers, CPUs, or memchr() implementations etc.  */
-          if (lines - plines <= bytes_read / 15)
-            long_lines = true;
-          else
-            long_lines = false;
-        }
+      ok = wc_lines_p (file, fd, &lines, &bytes);
     }
 #if MB_LEN_MAX > 1
 # define SUPPORT_OLD_MBRTOWC 1
@@ -706,6 +772,11 @@ main (int argc, char **argv)
   print_linelength = false;
   total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
 
+#ifdef USE_AVX2_WC_LINECOUNT
+  if (avx2_supported ())
+    wc_lines_p = wc_lines_avx2;
+#endif
+
   while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
     switch (optc)
       {
diff --git a/src/wc_avx2.c b/src/wc_avx2.c
new file mode 100644
index 000000000..634c1bbb0
--- /dev/null
+++ b/src/wc_avx2.c
@@ -0,0 +1,122 @@
+/* wc_avx - Count the number of newlines with avx2 instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include "system.h"
+#include "error.h"
+#include "safe-read.h"
+
+#include <x86intrin.h>
+
+/* This must be below 16 KB (16384) or else the accumulators can
+   theoretically overflow, producing wrong result. This is 2*32 bytes below,
+   so there is no single bytes in the optimal case. */
+#define BUFSIZE (16320)
+
+extern bool
+wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
+               uintmax_t *bytes_out);
+
+extern bool
+wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
+               uintmax_t *bytes_out)
+{
+  __m256i accumulator;
+  __m256i accumulator2;
+  __m256i zeroes;
+  __m256i endlines;
+  __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
+  __m256i *datap;
+  uintmax_t lines = 0;
+  uintmax_t bytes = 0;
+  size_t bytes_read = 0;
+
+
+  if (!lines_out || !bytes_out)
+    return false;
+
+  /* Using two parallel accumulators gave a good performance increase.
+     Adding a third gave no additional benefit, at least on an
+     Intel Xeon E3-1231v3.  Maybe on a newer CPU with additional vector
+     execution engines it would be a win. */
+  accumulator = _mm256_setzero_si256 ();
+  accumulator2 = _mm256_setzero_si256 ();
+  zeroes = _mm256_setzero_si256 ();
+  endlines = _mm256_set1_epi8 ('\n');
+
+  while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
+    {
+      __m256i to_match;
+      __m256i to_match2;
+      __m256i matches;
+      __m256i matches2;
+
+      if (bytes_read == SAFE_READ_ERROR)
+        {
+          error (0, errno, "%s", quotef (file));
+          return false;
+        }
+
+      bytes += bytes_read;
+
+      datap = avx_buf;
+      char *end = ((char *)avx_buf) + bytes_read;
+
+      while (bytes_read >= 64)
+        {
+          to_match = _mm256_load_si256 (datap);
+          to_match2 = _mm256_load_si256 (datap + 1);
+
+          matches = _mm256_cmpeq_epi8 (to_match, endlines);
+          matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
+          /* Compare will set each 8 bit integer in the register to 0xFF
+             on match.  When we subtract it the 8 bit accumulators
+             will underflow, so this is equal to adding 1. */
+          accumulator = _mm256_sub_epi8 (accumulator, matches);
+          accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
+
+          datap += 2;
+          bytes_read -= 64;
+        }
+
+      /* Horizontally add all 8 bit integers in the register,
+         and then reset it */
+      accumulator = _mm256_sad_epu8 (accumulator, zeroes);
+      lines +=   _mm256_extract_epi16 (accumulator, 0)
+               + _mm256_extract_epi16 (accumulator, 4)
+               + _mm256_extract_epi16 (accumulator, 8)
+               + _mm256_extract_epi16 (accumulator, 12);
+      accumulator = _mm256_setzero_si256 ();
+
+      accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
+      lines +=   _mm256_extract_epi16 (accumulator2, 0)
+               + _mm256_extract_epi16 (accumulator2, 4)
+               + _mm256_extract_epi16 (accumulator2, 8)
+               + _mm256_extract_epi16 (accumulator2, 12);
+      accumulator2 = _mm256_setzero_si256 ();
+
+      /* Finish up any left over bytes */
+      char *p = (char *)datap;
+      while (p != end)
+        lines += *p++ == '\n';
+    }
+
+  *lines_out = lines;
+  *bytes_out = bytes;
+
+  return true;
+}
-- 
2.26.2


>From 498a707873a33a99086fb38f0bfd821dd9795ed7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <p...@draigbrady.com>
Date: Sat, 1 May 2021 20:02:02 +0100
Subject: [PATCH 2/2] wc: add --debug to diagnose which implementation used

* src/wc.c: (main): Handle the new --debug option.
Only call avx2_supported if needed.
(avx2_supported): Diagnose various failures and attempts.
* NEWS: Mention the new wc improvement and --debug option.
---
 NEWS     |  4 ++++
 src/wc.c | 67 ++++++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/NEWS b/NEWS
index 090fbc728..beb34bba5 100644
--- a/NEWS
+++ b/NEWS
@@ -96,6 +96,10 @@ GNU coreutils NEWS                                    -*- outline -*-
 
   timeout now supports sub-second timeouts on macOS.
 
+  wc is up to 5 times faster when counting only new line characters,
+  where avx2 instructions are supported.
+  A new --debug option will indicate if avx2 is being used.
+
 
 * Noteworthy changes in release 8.32 (2020-03-05) [stable]
 
diff --git a/src/wc.c b/src/wc.c
index 35a865719..bdb51928d 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -69,6 +69,7 @@ static bool
 (*wc_lines_p) (char const *file, int fd, uintmax_t *lines_out,
                 uintmax_t *bytes_out) = wc_lines;
 
+static bool debug;
 
 /* Cumulative number of lines, words, chars and bytes in all files so far.
    max_line_length is the maximum over all files processed so far.  */
@@ -109,7 +110,8 @@ struct fstatus
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
 {
-  FILES0_FROM_OPTION = CHAR_MAX + 1
+  DEBUG_PROGRAM_OPTION = CHAR_MAX + 1,
+  FILES0_FROM_OPTION,
 };
 
 static struct option const longopts[] =
@@ -118,6 +120,7 @@ static struct option const longopts[] =
   {"chars", no_argument, NULL, 'm'},
   {"lines", no_argument, NULL, 'l'},
   {"words", no_argument, NULL, 'w'},
+  {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
   {"max-line-length", no_argument, NULL, 'L'},
   {GETOPT_HELP_OPTION_DECL},
@@ -133,22 +136,48 @@ avx2_supported (void)
   unsigned int ebx = 0;
   unsigned int ecx = 0;
   unsigned int edx = 0;
+  bool getcpuid_ok = false;
+  bool avx_enabled = false;
 
-  if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx))
-    return false;
-
-  if (! (ecx & bit_OSXSAVE))
-    return false;
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx))
+    {
+      getcpuid_ok = true;
+      if (ecx & bit_OSXSAVE)
+        avx_enabled = true;  /* Support is not disabled.  */
+    }
 
-  eax = ebx = ecx = edx = 0;
 
-  if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
-    return false;
+  if (avx_enabled)
+    {
+      eax = ebx = ecx = edx = 0;
+      if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+        getcpuid_ok = false;
+      else
+        {
+          if (! (ebx & bit_AVX2))
+            avx_enabled = false;  /* Hardware doesn't support it.  */
+        }
+    }
 
-  if (! (ebx & bit_AVX2))
-    return false;
 
-  return true;
+  if (! getcpuid_ok)
+    {
+      if (debug)
+        error (0, 0, "%s", _("failed to get cpuid"));
+      return false;
+    }
+  else if (! avx_enabled)
+    {
+      if (debug)
+        error (0, 0, "%s", _("avx2 support not detected"));
+      return false;
+    }
+  else
+    {
+      if (debug)
+        error (0, 0, "%s", _("using avx2 hardware support"));
+      return true;
+    }
 }
 #endif
 
@@ -418,6 +447,11 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
     }
   else if (!count_chars && !count_complicated)
     {
+#ifdef USE_AVX2_WC_LINECOUNT
+      if (avx2_supported ())
+        wc_lines_p = wc_lines_avx2;
+#endif
+
       /* Use a separate loop when counting only lines or lines and bytes --
          but not chars or words.  */
       ok = wc_lines_p (file, fd, &lines, &bytes);
@@ -772,11 +806,6 @@ main (int argc, char **argv)
   print_linelength = false;
   total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
 
-#ifdef USE_AVX2_WC_LINECOUNT
-  if (avx2_supported ())
-    wc_lines_p = wc_lines_avx2;
-#endif
-
   while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
     switch (optc)
       {
@@ -800,6 +829,10 @@ main (int argc, char **argv)
         print_linelength = true;
         break;
 
+      case DEBUG_PROGRAM_OPTION:
+        debug = true;
+        break;
+
       case FILES0_FROM_OPTION:
         files_from = optarg;
         break;
-- 
2.26.2

Reply via email to