Hi,

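Here's a patch for cp which adds a new --preallocate option. When
specified, cp allocates disk space for the destination file before writing
data. It uses fallocate() with FALLOC_FL_KEEP_SIZE on Linux. The
posix_fallocate() fallback is selected at compile time (when
USE_LINUX_FALLOCATE is not defined); this preliminary patch does not yet
retry with posix_fallocate() at run time if fallocate() fails.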

Benefits of preallocation:
 - Disk fragmentation can be greatly reduced. That means faster file
access and less filesystem overhead (fewer extents).
 - Recovering data after filesystem corruption should be more successful,
since files are more likely to be contiguous.
 - If you're copying, for example, a virtual machine disk image file, the
destination should be (almost) contiguous, meaning that running a disk
optimiser/defragmenter in the guest OS would work as it should (i.e.
improve performance).

This is a very preliminary patch for testing. Hopefully someone will find
it useful. And hopefully someone who (a) has a clue when it comes to C
programming, and (b) is familiar with the coreutils source (I'm neither)
can work from this to produce something which could be included in a
future release.

Note that posix_fallocate() sets the destination file size. If your system
doesn't support fallocate() with FALLOC_FL_KEEP_SIZE, you can't, for example,
run "ls -l destfilename" to monitor the progress of a large file copy; the
length shown will always be the final length.

Pre-allocating space can defeat the object of --sparse=always (or the
default sparse-checking heuristic). If you are copying files with large
holes, you probably won't want to use --preallocate. If you do, regions in
the destination corresponding to holes in the source will be allocated but
unwritten. You'll lose the disk-space-saving benefit, but keep the
fast-reading-of-holes benefit. On the other hand, having those regions
allocated in advance could sometimes be useful.

In the general case of copying non-sparse files, it should be beneficial
to use --preallocate. However, on some systems, when the destination
filesystem does not support pre-allocation (e.g. FAT32), the
implementation of posix_fallocate() might try to fill the region to be
pre-allocated with zeros. That would double the copy time for no benefit.

To-do list:
 - Add --preallocate option to mv as well
 - Should the option name be changed to --pre-allocate?
 - Maybe have an option to tell cp to pre-allocate space for all
destination files in one go, rather than pre-allocating space for each
individual file before copying?
 - Check the error code that fallocate() returns. If it says the
filesystem does not support fallocate(), don't call it again for every
other file being copied.
 - Better handling of sparse files, e.g. don't call fallocate() if source
file is sparse and --sparse=always is given.
 - If pre-allocation fails due to insufficient disk space, cp prints a
message and continues. So typically it will fill up the disk then abort
with an out-of-disk-space error. It would be nice to be able to tell cp
to abort when a pre-allocation fails, so it can exit without wasting
time.

The attached patch is based on coreutils 8.17.


-- Mark

diff -Naur coreutils-8.17/src/copy.c my_coreutils-8.17/src/copy.c
--- coreutils-8.17/src/copy.c   2012-05-08 09:34:30.000000000 +0100
+++ my_coreutils-8.17/src/copy.c        2012-05-11 13:52:57.925208868 +0100
@@ -23,6 +23,17 @@
 #include <sys/types.h>
 #include <selinux/selinux.h>
 
+/* TODO: Make this a configure/makefile option. */
+#define USE_LINUX_FALLOCATE 1
+
+/* For FALLOC_FL_KEEP_SIZE definition. Don't include linux/falloc.h because
+   that might not exist on some systems. */
+#ifdef USE_LINUX_FALLOCATE
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE 1
+#endif
+#endif
+
 #if HAVE_HURD_H
 # include <hurd.h>
 #endif
@@ -1050,6 +1061,37 @@
       buf_alloc = xmalloc (buf_size + buf_alignment_slop);
       buf = ptr_align (buf_alloc, buf_alignment);
 
+      /* Allocate space for the destination file if user specified the
+         --preallocate option.
+         TODO: Add an option to have cp abort if unable to allocate space,
+         instead of just printing a message. */
+
+      /* On Linux, we can use fallocate() with FALLOC_FL_KEEP_SIZE to pre-
+         allocate space without changing the apparent file size. */
+      if (x->preallocate)
+        {
+#ifdef USE_LINUX_FALLOCATE
+          if (fallocate(dest_desc, FALLOC_FL_KEEP_SIZE, 0,
+                        src_open_sb.st_size) == 0)
+            goto allocated;
+          else
+          /* TODO: Handle different errors better. E.g. if the kernel does not
+             support FALLOC_FL_KEEP_SIZE we could retry without that flag, or
+             if the filesystem does not support fallocate() we could remember
+             that and not call fallocate() for every file copied. */
+            error (0, errno, _("pre-allocating space for %s"),
+                   quote (dst_name));
+          /* If fallocate() with FALLOC_FL_KEEP_SIZE failed, fall back to
+             posix_fallocate(). */
+#else
+          if (posix_fallocate(dest_desc, 0, src_open_sb.st_size) != 0)
+            /* posix_fallocate() does not set errno */
+            fprintf (stderr, _("%s: Could not pre-allocate space for %s"),
+                     program_name, quote (dst_name));
+#endif
+        }
+
+allocated:
       if (sparse_src)
         {
           bool normal_copy_required;
diff -Naur coreutils-8.17/src/copy.h my_coreutils-8.17/src/copy.h
--- coreutils-8.17/src/copy.h   2012-03-24 18:22:13.000000000 +0000
+++ my_coreutils-8.17/src/copy.h        2012-05-11 13:08:42.788672598 +0100
@@ -242,6 +242,9 @@
      such a symlink) and returns false.  */
   bool open_dangling_dest_symlink;
 
+  /* Use fallocate()/posix_fallocate() to pre-allocate space for files */
+  bool preallocate;
+
   /* Control creation of COW files.  */
   enum Reflink_type reflink_mode;
 
diff -Naur coreutils-8.17/src/cp.c my_coreutils-8.17/src/cp.c
--- coreutils-8.17/src/cp.c     2012-05-01 21:55:08.000000000 +0100
+++ my_coreutils-8.17/src/cp.c  2012-05-11 13:53:21.089396098 +0100
@@ -80,7 +80,8 @@
   REFLINK_OPTION,
   SPARSE_OPTION,
   STRIP_TRAILING_SLASHES_OPTION,
-  UNLINK_DEST_BEFORE_OPENING
+  UNLINK_DEST_BEFORE_OPENING,
+  PREALLOCATE_OPTION
 };
 
 /* True if the kernel is SELinux enabled.  */
@@ -90,6 +91,9 @@
    as its destination instead of the usual "e_dir/e_file." */
 static bool parents_option = false;
 
+/* For --preallocate option */
+static bool preallocate_option = false;
+
 /* Remove any trailing slashes from each SOURCE argument.  */
 static bool remove_trailing_slashes;
 
@@ -130,6 +134,7 @@
   {"one-file-system", no_argument, NULL, 'x'},
   {"parents", no_argument, NULL, PARENTS_OPTION},
   {"path", no_argument, NULL, PARENTS_OPTION},   /* Deprecated.  */
+  {"preallocate", no_argument, NULL, PREALLOCATE_OPTION},
   {"preserve", optional_argument, NULL, PRESERVE_ATTRIBUTES_OPTION},
   {"recursive", no_argument, NULL, 'R'},
   {"remove-destination", no_argument, NULL, UNLINK_DEST_BEFORE_OPENING},
@@ -195,6 +200,15 @@
   -P, --no-dereference         never follow symbolic links in SOURCE\n\
 "), stdout);
       fputs (_("\
+      --preallocate            pre-allocate space for destination files before\n\
+                                 copying data. This can significantly reduce\n\
+                                 fragmentation and allows an early exit if\n\
+                                 there would not be enough free space. It can\n\
+                                 also increase the likelihood of successful\n\
+                                 data recovery after filesystem corruption\n\
+                                 since data is more likely to be contiguous.\n\
+"), stdout);
+      fputs (_("\
   -p                           same as --preserve=mode,ownership,timestamps\n\
       --preserve[=ATTR_LIST]   preserve the specified attributes (default:\n\
                                  mode,ownership,timestamps), if possible\n\
@@ -779,6 +793,7 @@
   x->one_file_system = false;
   x->reflink_mode = REFLINK_NEVER;
 
+  x->preallocate = false;
   x->preserve_ownership = false;
   x->preserve_links = false;
   x->preserve_mode = false;
@@ -1040,6 +1055,10 @@
           parents_option = true;
           break;
 
+        case PREALLOCATE_OPTION:
+          x.preallocate = true;
+          break;
+
         case 'r':
         case 'R':
           x.recursive = true;
diff -Naur coreutils-8.17/src/mv.c my_coreutils-8.17/src/mv.c
--- coreutils-8.17/src/mv.c     2012-05-01 21:55:08.000000000 +0100
+++ my_coreutils-8.17/src/mv.c  2012-05-11 13:09:54.069217905 +0100
@@ -132,6 +132,7 @@
   x->stdin_tty = isatty (STDIN_FILENO);
 
   x->open_dangling_dest_symlink = false;
+  x->preallocate = false;       /* FIXME: Add support for --preallocate */
   x->update = false;
   x->verbose = false;
   x->dest_info = NULL;

Reply via email to