Pádraig Brady <[email protected]> writes:

> On 18/02/2026 08:16, Collin Funk wrote:
>> I will have to add support for Neon in the 'cpu-supports' Gnulib
>> module and update tests/wc/wc-cpu.sh before pushing this.
>> However, that should be fairly trivial. So I thought it was best to
>> send this for others to review anyways.
>
> Very nice. 3x is a nice win.

I pushed the attached patch with the Gnulib update, plus the missing
documentation and test fix.

I chose to use the name "neon" in the messages, as my impression is that
this name is much more commonly used than "asimd" (Advanced SIMD). For the
environment variable though, I think we should stick to the HWCAP_*
macros, which unfortunately use ASIMD.

Collin

>From 6a3dde5dd2d77fa1aa5fac65d8f71bdad81471a1 Mon Sep 17 00:00:00 2001
Message-ID: <6a3dde5dd2d77fa1aa5fac65d8f71bdad81471a1.1771474640.git.collin.fu...@gmail.com>
From: Collin Funk <[email protected]>
Date: Wed, 18 Feb 2026 00:03:05 -0800
Subject: [PATCH] wc: add aarch64 Neon optimization for wc -l

Here is an example of the performance improvement:

    $ yes abcdefghijklmnopqrstuvwxyz | head -n 100000000 > input
    $ time ./src/wc-prev -l < input
    100000000

    real	0m0.793s
    user	0m0.630s
    sys	0m0.162s
    $ time ./src/wc -l < input
    100000000

    real	0m0.230s
    user	0m0.065s
    sys	0m0.164s

* NEWS: Mention the performance improvement.
* gnulib: Update to the latest commit.
* configure.ac: Check for the necessary intrinsics and functions.
* src/local.mk (noinst_LIBRARIES) [USE_NEON_WC_LINECOUNT]: Add
src/libwc_neon.a.
(src_libwc_neon_a_SOURCES, wc_neon_ldadd, src_libwc_neon_a_CFLAGS)
[USE_NEON_WC_LINECOUNT]: New variables.
(src_wc_LDADD) [USE_NEON_WC_LINECOUNT]: Add $(wc_neon_ldadd).
* src/wc.c [USE_NEON_WC_LINECOUNT]: Include sys/auxv.h and asm/hwcap.h.
(neon_supported) [USE_NEON_WC_LINECOUNT]: New function.
(wc_lines) [USE_NEON_WC_LINECOUNT]: Use neon_supported and
wc_lines_neon.
* src/wc.h (wc_lines_neon): Add declaration.
* src/wc_neon.c: New file.
* doc/coreutils.texi (Hardware Acceleration): Document the "-ASIMD"
hwcap and the variable used in ./configure to override detection of Neon
instructions.
* tests/wc/wc-cpu.sh: Also add "-ASIMD" to disable the use of Neon
instructions.
---
 NEWS               |   3 ++
 configure.ac       |  37 +++++++++++++++
 doc/coreutils.texi |   3 +-
 gnulib             |   2 +-
 src/local.mk       |   7 +++
 src/wc.c           |  27 +++++++++++
 src/wc.h           |   1 +
 src/wc_neon.c      | 109 +++++++++++++++++++++++++++++++++++++++++++++
 tests/wc/wc-cpu.sh |   4 +-
 9 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 src/wc_neon.c

diff --git a/NEWS b/NEWS
index 179deca90..c509cbc1e 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,9 @@ GNU coreutils NEWS                                    -*- outline -*-
 
   'nl' now supports multi-byte --section-delimiter characters.
 
+  'wc -l' now operates up to three times faster on hosts that support Neon
+  instructions.
+
 ** Build-related
 
   ./configure --enable-single-binary=hardlinks is now supported on systems
diff --git a/configure.ac b/configure.ac
index 2da648987..fdf8d067f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -807,6 +807,43 @@ fi
 AM_CONDITIONAL([USE_AVX512_WC_LINECOUNT],
                [test $utils_cv_avx512_intrinsic_exists = yes])
 
+CFLAGS=$ac_save_CFLAGS
+
+CFLAGS="-march=armv8-a+simd $CFLAGS"
+AC_MSG_CHECKING([for neon intrinsics])
+AC_CACHE_VAL([utils_cv_neon_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <sys/auxv.h>
+    #include <asm/hwcap.h>
+    #include <arm_neon.h>
+
+    int
+    main (void)
+    {
+      char buffer[128] = {0};
+      uint8x16_t v = vld1q_u8 (buffer);
+      uint8x16_t m = vceqq_u8 (v, v);
+      uint8x16_t s = vandq_u8 (m, m);
+      uint16x8_t a = vpaddlq_u8 (s);
+      uint32x4_t b = vpaddlq_u16 (a);
+      uint64x2_t c = vpaddlq_u32 (b);
+      int value = vgetq_lane_u64 (c, 0) + vgetq_lane_u64 (c, 1);
+      return value && 0 < (getauxval (AT_HWCAP) & HWCAP_ASIMD);
+    }
+  ]])
+  ],[
+    utils_cv_neon_intrinsic_exists=yes
+  ],[
+    utils_cv_neon_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_neon_intrinsic_exists])
+if test $utils_cv_neon_intrinsic_exists = yes; then
+  AC_DEFINE([USE_NEON_WC_LINECOUNT], [1], [Counting lines with Neon enabled])
+fi
+AM_CONDITIONAL([USE_NEON_WC_LINECOUNT],
+               [test $utils_cv_neon_intrinsic_exists = yes])
+
 CFLAGS=$ac_save_CFLAGS
 ############################################################################
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 455930746..def1a8820 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -1599,6 +1599,7 @@ @node Hardware Acceleration
 @item utils_cv_avx2_intrinsic_exists
 @item utils_cv_avx2_pclmul_intrinsic_exists
 @item utils_cv_avx512_pclmul_intrinsic_exists
+@item utils_cv_neon_intrinsic_exists
 @item utils_cv_pclmul_intrinsic_exists
 @item utils_cv_vmull_intrinsic_exists
 @end table
@@ -1623,7 +1624,7 @@ @node Hardware Acceleration
 @example
 export OPENSSL_ia32cap='0x0'
 export OPENSSL_armcap='0x0'
-export GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512F,-AVX2,-AVX,-PMULL'
+export GLIBC_TUNABLES='glibc.cpu.hwcaps=-ASIMD,-AVX512F,-AVX2,-AVX,-PMULL'
 @end example
 
 The @option{--debug} option is available on all utilities supporting
diff --git a/gnulib b/gnulib
index a5a25302c..08e222ff7 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit a5a25302cc8d20832f08c828f1d2de0961607e0f
+Subproject commit 08e222ff788896d872ff6526d755d869786bbee7
diff --git a/src/local.mk b/src/local.mk
index 29f07f254..bf88f7d0e 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -509,6 +509,13 @@ wc_avx2_ldadd = src/libwc_avx2.a
 src_wc_LDADD += $(wc_avx2_ldadd)
 src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS)
 endif
+if USE_NEON_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_neon.a
+src_libwc_neon_a_SOURCES = src/wc_neon.c
+wc_neon_ldadd = src/libwc_neon.a
+src_wc_LDADD += $(wc_neon_ldadd)
+src_libwc_neon_a_CFLAGS = -march=armv8-a+simd $(AM_CFLAGS)
+endif
 
 # Ensure we don't link against libcoreutils.a as that lib is
 # not compiled with -fPIC which causes issues on 64 bit at least
diff --git a/src/wc.c b/src/wc.c
index 76f8d21a4..eb6c3d9e4 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -31,6 +31,11 @@
 #include <stat-size.h>
 #include <xbinary-io.h>
 
+#ifdef USE_NEON_WC_LINECOUNT
+# include <sys/auxv.h>
+# include <asm/hwcap.h>
+#endif
+
 #include "system.h"
 #include "cpu-supports.h"
 #include "ioblksize.h"
@@ -159,6 +164,21 @@ avx512_supported (void)
 }
 #endif
 
+#ifdef USE_NEON_WC_LINECOUNT
+static bool
+neon_supported (void)
+{
+  bool neon_enabled = (cpu_may_support ("asimd")
+                       && 0 < (getauxval (AT_HWCAP) & HWCAP_ASIMD));
+  if (debug)
+    error (0, 0, (neon_enabled
+                  ? _("using neon hardware support")
+                  : _("neon support not detected")));
+
+  return neon_enabled;
+}
+#endif
+
 void
 usage (int status)
 {
@@ -298,6 +318,13 @@ wc_lines (int fd)
   if (0 < use_avx2)
     return wc_lines_avx2 (fd);
 #endif
+#ifdef USE_NEON_WC_LINECOUNT
+  static signed char use_neon;
+  if (!use_neon)
+    use_neon = neon_supported () ? 1 : -1;
+  if (0 < use_neon)
+    return wc_lines_neon (fd);
+#endif
 
   intmax_t lines = 0, bytes = 0;
   bool long_lines = false;
diff --git a/src/wc.h b/src/wc.h
index f151e92f2..fa0aef990 100644
--- a/src/wc.h
+++ b/src/wc.h
@@ -2,3 +2,4 @@
 struct wc_lines { int err; intmax_t lines; intmax_t bytes; };
 struct wc_lines wc_lines_avx2 (int);
 struct wc_lines wc_lines_avx512 (int);
+struct wc_lines wc_lines_neon (int fd);
diff --git a/src/wc_neon.c b/src/wc_neon.c
new file mode 100644
index 000000000..53f82b8b4
--- /dev/null
+++ b/src/wc_neon.c
@@ -0,0 +1,109 @@
+/* wc_neon - Count the number of newlines with neon instructions.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Collin Funk <[email protected]>, 2026.  */
+
+#include <config.h>
+
+#include "wc.h"
+#include "system.h"
+#include "ioblksize.h"
+
+#include <arm_neon.h>
+
+/* Read FD and return a summary.  */
+extern struct wc_lines
+wc_lines_neon (int fd)
+{
+  intmax_t lines = 0;
+  intmax_t bytes = 0;
+
+  uint8x16_t endlines = vdupq_n_u8 ('\n');
+  uint8x16_t ones = vdupq_n_u8 (1);
+
+  while (true)
+    {
+      unsigned char neon_buf[IO_BUFSIZE];
+      ssize_t bytes_read = read (fd, neon_buf, sizeof neon_buf);
+      if (bytes_read <= 0)
+        return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
+
+      bytes += bytes_read;
+      unsigned char *datap = neon_buf;
+
+      while (64 <= bytes_read)
+        {
+          /* Load 64 bytes from NEON_BUF.  */
+          uint8x16_t v0 = vld1q_u8 (datap);
+          uint8x16_t v1 = vld1q_u8 (datap + 16);
+          uint8x16_t v2 = vld1q_u8 (datap + 32);
+          uint8x16_t v3 = vld1q_u8 (datap + 48);
+
+          /* Bitwise equal with ENDLINES.  */
+          uint8x16_t m0 = vceqq_u8 (v0, endlines);
+          uint8x16_t m1 = vceqq_u8 (v1, endlines);
+          uint8x16_t m2 = vceqq_u8 (v2, endlines);
+          uint8x16_t m3 = vceqq_u8 (v3, endlines);
+
+          /* Bitwise and with ONES.  */
+          uint8x16_t s0 = vandq_u8 (m0, ones);
+          uint8x16_t s1 = vandq_u8 (m1, ones);
+          uint8x16_t s2 = vandq_u8 (m2, ones);
+          uint8x16_t s3 = vandq_u8 (m3, ones);
+
+          /* Sum the vectors.  */
+          uint16x8_t a0 = vpaddlq_u8 (s0);
+          uint16x8_t a1 = vpaddlq_u8 (s1);
+          uint16x8_t a2 = vpaddlq_u8 (s2);
+          uint16x8_t a3 = vpaddlq_u8 (s3);
+          uint32x4_t b0 = vpaddlq_u16 (a0);
+          uint32x4_t b1 = vpaddlq_u16 (a1);
+          uint32x4_t b2 = vpaddlq_u16 (a2);
+          uint32x4_t b3 = vpaddlq_u16 (a3);
+          uint64x2_t c0 = vpaddlq_u32 (b0);
+          uint64x2_t c1 = vpaddlq_u32 (b1);
+          uint64x2_t c2 = vpaddlq_u32 (b2);
+          uint64x2_t c3 = vpaddlq_u32 (b3);
+
+          /* Extract the vectors.  */
+          lines += (vgetq_lane_u64 (c0, 0) + vgetq_lane_u64 (c0, 1)
+                    + vgetq_lane_u64 (c1, 0) + vgetq_lane_u64 (c1, 1)
+                    + vgetq_lane_u64 (c2, 0) + vgetq_lane_u64 (c2, 1)
+                    + vgetq_lane_u64 (c3, 0) + vgetq_lane_u64 (c3, 1));
+
+          datap += 64;
+          bytes_read -= 64;
+        }
+
+      while (16 <= bytes_read)
+        {
+          uint8x16_t v = vld1q_u8 (datap);
+          uint8x16_t m = vceqq_u8 (v, endlines);
+          uint8x16_t s = vandq_u8 (m, ones);
+          uint16x8_t a = vpaddlq_u8 (s);
+          uint32x4_t b = vpaddlq_u16 (a);
+          uint64x2_t c = vpaddlq_u32 (b);
+          lines += vgetq_lane_u64 (c, 0) + vgetq_lane_u64 (c, 1);
+          datap += 16;
+          bytes_read -= 16;
+        }
+
+      /* Finish up any left over bytes.  */
+      unsigned char *end = (unsigned char *) datap + bytes_read;
+      for (unsigned char *p = (unsigned char *) datap; p < end; p++)
+        lines += *p == '\n';
+    }
+}
diff --git a/tests/wc/wc-cpu.sh b/tests/wc/wc-cpu.sh
index 13366f59c..8aed6f808 100755
--- a/tests/wc/wc-cpu.sh
+++ b/tests/wc/wc-cpu.sh
@@ -19,7 +19,7 @@
 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ wc
 
-GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \
+GLIBC_TUNABLES='glibc.cpu.hwcaps=-ASIMD,-AVX2,-AVX512F' \
  wc -l --debug /dev/null 2>debug || fail=1
 grep 'using.*hardware support' debug && fail=1
 
@@ -32,7 +32,7 @@ wc_accelerated_no_avx512=$(
           wc -l < lines
          ) || fail=1
 wc_base=$(
-          GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \
+          GLIBC_TUNABLES='glibc.cpu.hwcaps=-ASIMD,-AVX2,-AVX512F' \
           wc -l < lines
          ) || fail=1
 
-- 
2.53.0

Reply via email to