Pádraig Brady <[email protected]> writes:

> On 04/04/2026 03:27, Collin Funk wrote:
>> Attached a v4 patch.
>
> Since cat is such a fundamental tool, I think it's worth being extra defensive
> by trying the read()/write() loop after the splice.
> I've heard of cases where some fuse file systems lie about the size in stat,
> while read() provides all the data.
>
> Also it seems a bit wasteful to allocate large pipes in kernel mem,
> for catting small files.
>
> I think we could handle both these cases with the attached adjustment,
> which avoids splice for small regular files, and tries read()/write()
> after splice() always.  Note the size threshold was determined with:
>
>   $ truncate -s $((32*1024)) mid
>   $ hyperfine -N 'src/cat-splice mid' 'src/cat mid'
>
> Since we're not doing any more stats() to determine this info,
> this should be more memory efficient and safer.

This seems reasonable.

However, there is an issue with your patch. Falling back to read() means
that the user will have to send two EOFs to exit the program. One for
the splice() call and one for the read() call. I forgot about this but
noticed it when tests/tty/tty-eof.pl failed after running 'make check'.

How about this patch which disables the use of splice() when the input
file descriptor is associated with a terminal? It feels a bit safer than
trusting the EOF from splice().

Collin

>From 572dc10a3b1e6c24cbd2fc40ed276c4095f518b6 Mon Sep 17 00:00:00 2001
Message-ID: <572dc10a3b1e6c24cbd2fc40ed276c4095f518b6.1775451127.git.collin.fu...@gmail.com>
From: Collin Funk <[email protected]>
Date: Sun, 29 Mar 2026 16:13:01 -0700
Subject: [PATCH v5] cat: use splice if operating on pipes or if
 copy_file_range fails

On an AMD Ryzen 7 3700X system:

    $ timeout 10 taskset 1 ./src/cat-prev /dev/zero \
        | taskset 2 pv -r > /dev/null
    [1.67GiB/s]
    $ timeout 10 taskset 1 ./src/cat /dev/zero \
        | taskset 2 pv -r > /dev/null
    [9.03GiB/s]

On a Power10 system:

    $ taskset 1 ./src/yes | timeout 10 taskset 2 ./src/cat-prev \
        | taskset 3 pv -r > /dev/null
    [12.9GiB/s]
    $ taskset 1 ./src/yes | timeout 10 taskset 2 ./src/cat \
            | taskset 3 pv -r > /dev/null
    [81.8GiB/s]

* NEWS: Mention the improvement.
* src/cat.c: Include isapipe.h, splice.h, and unistd--.h.
(splice_cat): New function.
(main): Use it.
* src/local.mk (noinst_HEADERS): Add src/splice.h.
* src/splice.h: New file, based on definitions from src/yes.c.
* src/yes.c: Include splice.h.
(pipe_splice_size): Use increase_pipe_size from src/splice.h.
(SPLICE_PIPE_SIZE): Remove definition, moved to src/splice.h.
* tests/cat/splice.sh: New file, based on some tests in
tests/misc/yes.sh.
* tests/local.mk (all_tests): Add the new test.
---
 NEWS                |   4 ++
 src/cat.c           | 131 +++++++++++++++++++++++++++++++++++++++++++-
 src/local.mk        |   1 +
 src/splice.h        |  41 ++++++++++++++
 src/yes.c           |  15 +----
 tests/cat/splice.sh |  66 ++++++++++++++++++++++
 tests/local.mk      |   1 +
 7 files changed, 243 insertions(+), 16 deletions(-)
 create mode 100644 src/splice.h
 create mode 100755 tests/cat/splice.sh

diff --git a/NEWS b/NEWS
index 98f0724e3..52a6dd3d1 100644
--- a/NEWS
+++ b/NEWS
@@ -39,6 +39,10 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  'cat' now uses zero-copy I/O on Linux when the input or output is a pipe,
+  to significantly increase throughput.
+  E.g., throughput improved 6x from 12.9GiB/s to 81.8GiB/s on a Power10 system.
+
   'df --local' recognises more file system types as remote.
   Specifically: autofs, ncpfs, smb, smb2, gfs, gfs2, userlandfs.
 
diff --git a/src/cat.c b/src/cat.c
index f9c92005c..02b2acdc8 100644
--- a/src/cat.c
+++ b/src/cat.c
@@ -37,6 +37,9 @@
 #include "ioblksize.h"
 #include "fadvise.h"
 #include "full-write.h"
+#include "isapipe.h"
+#include "splice.h"
+#include "unistd--.h"
 #include "xbinary-io.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -545,6 +548,107 @@ copy_cat (void)
       }
 }
 
+/* Copy data from input to output using splice if possible.
+   Return 1 if successful, 0 if ordinary read+write should be tried,
+   -1 if a serious problem has been diagnosed.  */
+
+static int
+splice_cat (void)
+{
+  bool some_copied = false;
+  bool in_ok = true;
+  bool out_ok = true;
+
+#if HAVE_SPLICE
+
+  static int stdout_is_pipe = -1;
+  static idx_t stdout_pipe_size = 0;
+  if (stdout_is_pipe == -1)
+    {
+      stdout_is_pipe = 0 < isapipe (STDOUT_FILENO);
+      if (stdout_is_pipe)
+        stdout_pipe_size = increase_pipe_size (STDOUT_FILENO);
+    }
+
+  bool input_is_pipe = 0 < isapipe (input_desc);
+
+  idx_t pipe_size = stdout_pipe_size;
+  if (input_is_pipe)
+    pipe_size = MAX (pipe_size, increase_pipe_size (input_desc));
+
+  int pipefd[2] = { -1, -1 };
+
+  /* Always create an intermediate pipe, even if both input and
+     output are pipes, so that read and write errors can be
+     distinguished.  */
+  if (pipe (pipefd) < 0)
+    return false;
+  pipe_size = MAX (pipe_size, increase_pipe_size (pipefd[1]));
+
+  while (true)
+    {
+      ssize_t bytes_read = splice (input_desc, NULL, pipefd[1], NULL,
+                                   pipe_size, 0);
+      /* If we successfully splice'd input previously, assume that any
+         subsequent error is fatal.  If not, then fall back to read
+         and write.  */
+      in_ok = 0 <= bytes_read || ! some_copied;
+      if (bytes_read <= 0)
+        goto done;
+      /* We need to drain the intermediate pipe to standard output.  */
+      while (0 < bytes_read)
+        {
+          ssize_t bytes_written = splice (pipefd[0], NULL, STDOUT_FILENO, NULL,
+                                          pipe_size, 0);
+          /* If we successfully splice'd output, assume any subsequent
+             error is fatal.  If not, then drain the intermediate pipe and
+             continue using read and write.  */
+          if (bytes_written < 0)
+            {
+              if (some_copied)
+                out_ok = false;
+              else
+                {
+                  char buf[BUFSIZ];
+                  while (0 < bytes_read)
+                    {
+                      ssize_t count = MIN (bytes_read, sizeof buf);
+                      ssize_t n_read = read (pipefd[0], buf, count);
+                      /* Failure not associated with in or out.  */
+                      in_ok = out_ok = 0 <= n_read;
+                      if (n_read <= 0)
+                        goto done;
+                      if (full_write (STDOUT_FILENO, buf, n_read) != n_read)
+                        write_error ();
+                      bytes_read -= n_read;
+                    }
+                }
+            }
+          if (bytes_written <= 0)
+            goto done;
+          some_copied = true;
+          bytes_read -= bytes_written;
+        }
+    }
+
+ done:
+  if (! in_ok && ! out_ok)
+    error (0, errno, "%s", _("splice error"));
+  else if (! in_ok)
+    error (0, errno, "%s", quotef (infile));
+  else if (! out_ok)
+    write_error ();
+  if (0 <= pipefd[0])
+    {
+      int saved_errno = errno;
+      close (pipefd[0]);
+      close (pipefd[1]);
+      errno = saved_errno;
+    }
+#endif
+
+  return (in_ok && out_ok) ? some_copied : -1;
+}
 
 int
 main (int argc, char **argv)
@@ -760,9 +864,30 @@ main (int argc, char **argv)
             }
           else
             {
-              insize = MAX (insize, outsize);
-              inbuf = xalignalloc (page_size, insize);
-              ok &= simple_cat (inbuf, insize);
+              /* Note 32768 was determined as the limit when splice
+                 starts to have a performance advantage.  It also
+                 excludes zero length files which may not be compatible
+                 with splice in some edge cases.  Also, don't use splice
+                 on terminals where falling back to read() would require
+                 the user to send an EOF twice to exit the program.  */
+              int splice_cat_status = (((usable_st_size (&istat_buf)
+                                         && istat_buf.st_size < 32768)
+                                        || isatty (input_desc))
+                                       ? 0 : splice_cat ());
+              if (splice_cat_status < 0)
+                {
+                  inbuf = NULL;
+                  ok = false;
+                }
+              else
+                {
+                  /* Note we try simple_cat() even if splice_cat() succeeded,
+                     to handle edge cases where splice finishes but read()
+                     still returns data (seen on some FUSE systems).  */
+                  insize = MAX (insize, outsize);
+                  inbuf = xalignalloc (page_size, insize);
+                  ok &= simple_cat (inbuf, insize);
+                }
             }
         }
       else
diff --git a/src/local.mk b/src/local.mk
index bf88f7d0e..9d9c9814b 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -61,6 +61,7 @@ noinst_HEADERS =		\
   src/remove.h			\
   src/set-fields.h		\
   src/show-date.h		\
+  src/splice.h			\
   src/statx.h			\
   src/system.h			\
   src/temp-stream.h		\
diff --git a/src/splice.h b/src/splice.h
new file mode 100644
index 000000000..1fb55054d
--- /dev/null
+++ b/src/splice.h
@@ -0,0 +1,41 @@
+/* Common definitions for splice and vmsplice.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef SPLICE_H
+# define SPLICE_H 1
+
+# if HAVE_SPLICE
+
+/* Empirically determined pipe size for best throughput.
+   Needs to be <= /proc/sys/fs/pipe-max-size  */
+enum { SPLICE_PIPE_SIZE = 512 * 1024 };
+
+static inline idx_t
+increase_pipe_size (int fd)
+{
+  int pipe_cap = 0;
+#  if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ
+  if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0)
+    pipe_cap = fcntl (fd, F_GETPIPE_SZ);
+#  endif
+  if (pipe_cap <= 0)
+    pipe_cap = 64 * 1024;
+  return pipe_cap;
+}
+
+# endif
+
+#endif
diff --git a/src/yes.c b/src/yes.c
index 1a1d74ce5..d111b125e 100644
--- a/src/yes.c
+++ b/src/yes.c
@@ -27,6 +27,7 @@
 #include "full-write.h"
 #include "isapipe.h"
 #include "long-options.h"
+#include "splice.h"
 #include "unistd--.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -76,10 +77,6 @@ repeat_pattern (char *dest, char const *src, idx_t srcsize, idx_t bufsize)
 
 #if HAVE_SPLICE
 
-/* Empirically determined pipe size for best throughput.
-   Needs to be <= /proc/sys/fs/pipe-max-size  */
-enum { SPLICE_PIPE_SIZE = 512 * 1024 };
-
 /* Enlarge a pipe towards SPLICE_PIPE_SIZE and return the actual
    capacity as a quarter of the pipe size (the empirical sweet spot
    for vmsplice throughput), rounded down to a multiple of COPYSIZE.
@@ -88,15 +85,7 @@ enum { SPLICE_PIPE_SIZE = 512 * 1024 };
 static idx_t
 pipe_splice_size (int fd, idx_t copysize)
 {
-  int pipe_cap = 0;
-# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ
-  if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0)
-    pipe_cap = fcntl (fd, F_GETPIPE_SZ);
-# endif
-  if (pipe_cap <= 0)
-    pipe_cap = 64 * 1024;
-
-  size_t buf_cap = pipe_cap / 4;
+  size_t buf_cap = increase_pipe_size (fd) / 4;
   return buf_cap / copysize * copysize;
 }
 
diff --git a/tests/cat/splice.sh b/tests/cat/splice.sh
new file mode 100755
index 000000000..513a33181
--- /dev/null
+++ b/tests/cat/splice.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+# Test some cases where 'cat' uses the splice system call.
+
+# Copyright (C) 2026 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ cat
+getlimits_
+uses_strace_
+
+# Check the non-pipe output case, since that is different with splice
+if timeout 10 true; then
+  timeout .1 cat /dev/zero >/dev/null
+  test $? = 124 || fail=1
+fi
+
+# Test that splice errors are diagnosed.
+# Odd numbers are for input, even for output
+if strace -o /dev/null -e inject=splice:error=EIO:when=3 true; then
+  for when in 3 4; do
+    test "$when" = 4 && efile='write error' || efile='/dev/zero'
+    printf 'cat: %s: %s\n' "$efile" "$EIO" > exp || framework_failure_
+    returns_ 1 timeout 10 strace -o /dev/null \
+      -e inject=splice:error=EIO:when=$when \
+      cat /dev/zero >/dev/null 2>err || fail=1
+    compare exp err || fail=1
+  done
+fi
+
+# Ensure we fall back to write() if there is an issue with (async) zero-copy
+zc_syscalls='io_uring_setup io_uring_enter io_uring_register memfd_create
+             sendfile splice tee vmsplice'
+syscalls=$(
+  for s in $zc_syscalls; do
+    strace -qe "$s" true >/dev/null 2>&1 && echo "$s"
+  done | paste -s -d,)
+
+no_zero_copy() {
+  strace -f -o /dev/null -e inject=${syscalls}:error=ENOSYS "$@"
+}
+if no_zero_copy true; then
+  test "$(no_zero_copy cat /dev/zero | head -c 2 | tr '\0' 'y')" = 'yy' \
+    || fail=1
+fi
+# Ensure we fall back to write() if there is an issue with pipe2()
+# For example if we don't have enough file descriptors available.
+no_pipe() { strace -f -o /dev/null -e inject=pipe,pipe2:error=EMFILE "$@"; }
+if no_pipe true; then
+  no_pipe timeout .1 cat /dev/zero >/dev/null
+  test $? = 124 || fail=1
+fi
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk
index 590978297..2e889e207 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -308,6 +308,7 @@ all_tests =					\
   tests/cat/cat-proc.sh				\
   tests/cat/cat-buf.sh				\
   tests/cat/cat-self.sh				\
+  tests/cat/splice.sh				\
   tests/misc/basename.pl			\
   tests/basenc/base64.pl			\
   tests/basenc/basenc.pl			\
-- 
2.53.0

Reply via email to