On 8/22/25 07:04, Pádraig Brady wrote:
/* Copy this extent, looking for further opportunities to not
bother to write zeros if --sparse=always, since SEEK_HOLE
is conservative and may miss some holes. */
So the comment needs to be tweaked, but a more general issue
is that it disables copy offloading (copy_file_range) for sparse files.
Ouch, I didn't see that. That's a real loss. I installed the first
attached patch to revert that part of my recent change.
I assume the part of the change that always punches holes is OK. I
couldn't see why one would not want to punch a hole if one has already
taken the trouble to find and create the hole.
BTW commit 26bf557 also changed this a couple of weeks ago
without updating the comment, so the comment relates to sparse_mode !=
SPARSE_ALWAYS.
Not quite following but I hope the comment is OK now with the first
patch installed.
squashfs is giving the wrong info here after all,
right?
Yes, that's the actual space-performance bug here. I installed the
second attached patch to try to work around it.
Jeremy, can you please try these two further patches? Thanks.
From 306de6c2619e2a9339ade9a88d55c4940942d516 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 22 Aug 2025 10:37:50 -0700
Subject: [PATCH 1/2] cp: go back to copy_file_range optimization
This reverts part of the previous change.
* src/copy.c (lseek_copy): When calling sparse_copy, do not
ask it to scan for zeros unless --sparse=always, so that it
can use copy_file_range which can be far more efficient.
---
src/copy.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/copy.c b/src/copy.c
index e328bd886..134c57c8b 100644
--- a/src/copy.c
+++ b/src/copy.c
@@ -617,7 +617,7 @@ lseek_copy (int src_fd, int dest_fd, char **abuf, size_t buf_size,
if ( ! sparse_copy (src_fd, dest_fd, abuf, buf_size,
allow_reflink, src_name, dst_name,
ext_len,
- sparse_mode != SPARSE_NEVER ? hole_size : nullptr,
+ sparse_mode == SPARSE_ALWAYS ? hole_size : nullptr,
&n_read))
return false;
--
2.50.1
From 39f22fe687ea0c226e3fb35e86cd5ea329180b80 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 22 Aug 2025 17:34:04 -0700
Subject: [PATCH 2/2] cp: improve hole handling on squashfs
Better fix for problem reported by Jeremy Allison
<https://bugs.gnu.org/79267>.
* src/copy.c (struct scan_inference): New type, replacing
union scan_inference. All uses changed. This is so
infer_scantype can report the first hole's offset when known.
(lseek_copy): 5th arg is now struct scan_inference const *,
not just off_t. All uses changed.
(infer_scantype): If SEEK_SET+SEEK_HOLE do not find a hole,
fall back on ZERO_SCANTYPE.
---
NEWS | 4 ++++
src/copy.c | 60 +++++++++++++++++++++++++++++++++++-------------------
2 files changed, 43 insertions(+), 21 deletions(-)
diff --git a/NEWS b/NEWS
index 5724698bf..2a8056658 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,10 @@ GNU coreutils NEWS -*- outline -*-
data zeros used extents rather than holes.
[This bug was present in "the beginning".]
+ cp missed opportunities to create holes when copying from file
+ systems like squashfs that support SEEK_HOLE only trivially.
+ [bug introduced in coreutils-9.0]
+
cp, install, and mv now avoid possible data corruption on
glibc 2.41 and 2.42 systems when copy_file_range is used with ranges > 2GiB,
avoiding https://sourceware.org/PR33245
diff --git a/src/copy.c b/src/copy.c
index 134c57c8b..73ded6857 100644
--- a/src/copy.c
+++ b/src/copy.c
@@ -106,6 +106,16 @@
# define CAN_HARDLINK_SYMLINKS 0
#endif
+/* Result of infer_scantype when it returns LSEEK_SCANTYPE. */
+struct scan_inference
+{
+ /* The offset of the first data block, or -1 if the file has no data. */
+ off_t ext_start;
+
+ /* The offset of the first hole. */
+ off_t hole_start;
+};
+
struct dir_list
{
struct dir_list *parent;
@@ -540,7 +550,7 @@ write_zeros (int fd, off_t n_bytes)
static bool
lseek_copy (int src_fd, int dest_fd, char **abuf, size_t buf_size,
- off_t ext_start, off_t src_total_size,
+ struct scan_inference const *scan_inference, off_t src_total_size,
enum Sparse_type sparse_mode,
bool allow_reflink,
char const *src_name, char const *dst_name,
@@ -552,9 +562,11 @@ lseek_copy (int src_fd, int dest_fd, char **abuf, size_t buf_size,
copy_debug.sparse_detection = COPY_DEBUG_EXTERNAL;
- while (0 <= ext_start)
+ for (off_t ext_start = scan_inference->ext_start; 0 <= ext_start; )
{
- off_t ext_end = lseek (src_fd, ext_start, SEEK_HOLE);
+ off_t ext_end = (ext_start == 0
+ ? scan_inference->hole_start
+ : lseek (src_fd, ext_start, SEEK_HOLE));
if (ext_end < 0)
{
if (errno != ENXIO)
@@ -1085,23 +1097,13 @@ enum scantype
LSEEK_SCANTYPE,
};
-/* Result of infer_scantype. */
-union scan_inference
-{
- /* Used if infer_scantype returns LSEEK_SCANTYPE. This is the
- offset of the first data block, or -1 if the file has no data. */
- off_t ext_start;
-};
-
/* Return how to scan a file with descriptor FD and stat buffer SB.
- *SCAN_INFERENCE is set to a valid value if returning LSEEK_SCANTYPE. */
+ Set *SCAN_INFERENCE if returning LSEEK_SCANTYPE. */
static enum scantype
infer_scantype (int fd, struct stat const *sb,
- union scan_inference *scan_inference)
+ struct scan_inference *scan_inference)
{
- scan_inference->ext_start = -1; /* avoid -Wmaybe-uninitialized */
-
- /* Only attempt SEEK_HOLE if this heuristic
+ /* Try SEEK_HOLE only if this heuristic
suggests the file is sparse. */
if (! (HAVE_STRUCT_STAT_ST_BLOCKS
&& S_ISREG (sb->st_mode)
@@ -1109,10 +1111,26 @@ infer_scantype (int fd, struct stat const *sb,
return PLAIN_SCANTYPE;
#ifdef SEEK_HOLE
- off_t ext_start = lseek (fd, 0, SEEK_DATA);
- if (0 <= ext_start || errno == ENXIO)
+ scan_inference->ext_start = lseek (fd, 0, SEEK_DATA);
+ if (scan_inference->ext_start == 0)
+ {
+ scan_inference->hole_start = lseek (fd, 0, SEEK_HOLE);
+ if (0 <= scan_inference->hole_start)
+ {
+ if (scan_inference->hole_start < sb->st_size)
+ return LSEEK_SCANTYPE;
+
+ /* Though the file likely has holes, SEEK_DATA and SEEK_HOLE
+ didn't find any. This can happen with file systems like
+ circa-2025 squashfs that support SEEK_HOLE only trivially.
+ Fall back on ZERO_SCANTYPE. */
+ if (lseek (fd, 0, SEEK_SET) < 0)
+ return ERROR_SCANTYPE;
+ }
+ }
+ else if (0 < scan_inference->ext_start || errno == ENXIO)
{
- scan_inference->ext_start = ext_start;
+ scan_inference->hole_start = 0; /* Pacify -Wmaybe-uninitialized. */
return LSEEK_SCANTYPE;
}
else if (errno != EINVAL && !is_ENOTSUP (errno))
@@ -1209,7 +1227,6 @@ copy_reg (char const *src_name, char const *dst_name,
mode_t extra_permissions;
struct stat sb;
struct stat src_open_sb;
- union scan_inference scan_inference;
bool return_val = true;
bool data_copy_required = x->data_copy_required;
bool preserve_xattr = USE_XATTR & x->preserve_xattr;
@@ -1518,6 +1535,7 @@ copy_reg (char const *src_name, char const *dst_name,
size_t buf_size = io_blksize (&sb);
/* Deal with sparse files. */
+ struct scan_inference scan_inference;
enum scantype scantype = infer_scantype (source_desc, &src_open_sb,
&scan_inference);
if (scantype == ERROR_SCANTYPE)
@@ -1565,7 +1583,7 @@ copy_reg (char const *src_name, char const *dst_name,
#ifdef SEEK_HOLE
scantype == LSEEK_SCANTYPE
? lseek_copy (source_desc, dest_desc, &buf, buf_size,
- scan_inference.ext_start, src_open_sb.st_size,
+ &scan_inference, src_open_sb.st_size,
make_holes ? x->sparse_mode : SPARSE_NEVER,
x->reflink_mode != REFLINK_NEVER,
src_name, dst_name, &hole_size, &n_read)
--
2.50.1