Hello All, This is the revised version.
Both '--fiemap' and '--fiemap-sync' options were eliminated in this version. Now cp(1) will perform fiemap optimization whenever possible if cp(1) is invoked with --sparse=[WHEN]; otherwise, it will fall back to the standard copy. I have run 'make syntax-check' to ensure the code style is ok. The tests were run against 'btrfs', 'ocfs2' and 'ext4', each mounted on a physical partition. /dev/sda9 on /btrfs type btrfs (rw) /dev/sda8 on /ocfs2 type ocfs2 (rw,_netdev,nointr,user_xattr,acl,heartbeat=local) /dev/sda11 on /ext4 type ext4 (rw,acl,user_xattr) j...@jeff-laptop:~/opensource_dev/coreutils$ uname -a Linux jeff-laptop 2.6.33-rc5-00238-gb04da8b-dirty #11 SMP Sat Dec 19 22:02:01 CST 2009 i686 GNU/Linux In the following performance comparisons, './src/cp' means the fiemap-enabled `cp', and '/bin/cp' is the normal one, which was built from the upstream git source code. The performance is good for 'ocfs2' and 'ext4', but for 'btrfs' it is not as good as expected. Tests: ====== j...@jeff-laptop:~/opensource_dev/coreutils$ dd if=/dev/zero of=/ocfs2/sparse_2 bs=1k count=1 seek=10022 1+0 records in 1+0 records out 1024 bytes (1.0 kB) copied, 0.000382239 s, 2.7 MB/s j...@jeff-laptop:~/opensource_dev/coreutils$ time ./src/cp --sparse=always /btrfs/sparse_2 /btrfs/sparse_2_fiemap real 0m0.252s user 0m0.004s sys 0m0.232s j...@jeff-laptop:~/opensource_dev/coreutils$ time /bin/cp --sparse=always /btrfs/sparse_2 /btrfs/sparse_2_normal real 0m0.069s user 0m0.004s sys 0m0.036s j...@jeff-laptop:~/opensource_dev/coreutils$ time ./src/cp --sparse=always /ocfs2/sparse_2 /ocfs2/sparse_2_fiemap real 0m0.019s user 0m0.000s sys 0m0.012s j...@jeff-laptop:~/opensource_dev/coreutils$ time /bin/cp --sparse=always /ocfs2/sparse_2 /ocfs2/sparse_2_normal real 0m0.103s user 0m0.000s sys 0m0.092s j...@jeff-laptop:~/opensource_dev/coreutils$ time /bin/cp --sparse=always /ext4/sparse_2 /ext4/sparse_2_normal real 0m0.103s user 0m0.000s sys 0m0.064s j...@jeff-laptop:~/opensource_dev/coreutils$ time 
./src/cp --sparse=always /ext4/sparse_2 /ext4/sparse_2_fiemap real 0m0.012s user 0m0.000s sys 0m0.008s I also tested a sparse file that has holes in the middle. j...@jeff-laptop:~/opensource_dev/coreutils$ ls -l /btrfs/sparse_4 -rw-r-xr-- 1 jeff jeff 838860800 Apr 17 14:24 /btrfs/sparse_4 j...@jeff-laptop:~/opensource_dev/coreutils$ time ./src/cp --sparse=always /btrfs/sparse_4 /btrfs/sparse_4_fiemap real 0m43.227s user 0m0.156s sys 0m21.749s j...@jeff-laptop:~/opensource_dev/coreutils$ time /bin/cp --sparse=always /btrfs/sparse_4 /btrfs/sparse_4_normal real 0m18.282s user 0m0.232s sys 0m14.301s j...@jeff-laptop:~/opensource_dev/coreutils$ time sudo /bin/cp --sparse=always /ocfs2/sparse_4 /ocfs2/sparse_4_normal real 0m7.583s user 0m0.216s sys 0m7.292s j...@jeff-laptop:~/opensource_dev/coreutils$ time sudo src/cp --sparse=always /ocfs2/sparse_4 /ocfs2/sparse_4_fiemap real 0m0.133s user 0m0.000s sys 0m0.076s j...@jeff-laptop:~/opensource_dev/du_enhance_project/coreutils$ ls -l /ocfs2/sparse_4* -rw-r-xr-- 1 root root 838860800 Apr 16 22:52 /ocfs2/sparse_4 -rw-r-xr-- 1 jeff jeff 838860800 Apr 17 14:26 /ocfs2/sparse_4_fiemap -rw-r-xr-- 1 jeff jeff 838860800 Apr 17 14:27 /ocfs2/sparse_4_normal j...@jeff-laptop:~/opensource_dev/coreutils$ rdiff signature /ocfs2/sparse_4 /ocfs2/sparse_4_fiemap j...@jeff-laptop:~/opensource_dev/coreutils$ echo $? 
0 j...@jeff-laptop:~/opensource_dev/coreutils$ time /bin/cp --sparse=always /ext4/sparse_4 /ext4/sparse_4_normal real 0m7.258s user 0m0.244s sys 0m4.468s j...@jeff-laptop:~/opensource_dev/coreutils$ time ./src/cp --sparse=always /ext4/sparse_4 /ext4/sparse_4_fiemap real 0m0.173s user 0m0.000s sys 0m0.052s >From 95f26596e932b87b2da1695086af156f24ceadfa Mon Sep 17 00:00:00 2001 From: Jie Liu <jeff....@oracle.com> Date: Sat, 17 Apr 2010 16:19:49 +0800 Subject: [PATCH 1/1] Introduct fiemap ioctl(2) for efficient sparse file copy v3 Signed-off-by: Jie Liu <jeff....@oracle.com> --- src/copy.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/fiemap.h | 102 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+), 0 deletions(-) create mode 100644 src/fiemap.h diff --git a/src/copy.c b/src/copy.c index 3c32fa3..b754d9e 100644 --- a/src/copy.c +++ b/src/copy.c @@ -65,6 +65,10 @@ # include <sys/ioctl.h> #endif +#ifndef HAVE_FIEMAP +# include "fiemap.h" +#endif + #ifndef HAVE_FCHOWN # define HAVE_FCHOWN false # define fchown(fd, uid, gid) (-1) @@ -151,6 +155,130 @@ clone_file (int dest_fd, int src_fd) #endif } +#ifdef __linux__ +# ifndef FS_IOC_FIEMAP +# define FS_IOC_FIEMAP _IOWR ('f', 11, struct fiemap) +# endif +/* Perform FIEMAP(available in mainline 2.6.27) copy if possible. + Call ioctl(2) with FS_IOC_FIEMAP to efficiently map file allocation + excepts holes. So the overhead to deal with holes with lseek(2) in + normal copy could be saved. This would result in much faster backups + for any kind of sparse file. 
*/ +static bool +fiemap_copy_ok (int src_fd, int dest_fd, size_t buf_size, + off_t src_total_size, char const *src_name, + char const *dst_name) +{ + int last = 0; + unsigned int i; + bool return_val = true; + char fiemap_buf[4096] = ""; + struct fiemap *fiemap = (struct fiemap *)fiemap_buf; + struct fiemap_extent *fm_ext = &fiemap->fm_extents[0]; + uint32_t count = (sizeof (fiemap_buf) - sizeof (*fiemap)) / + sizeof (struct fiemap_extent); + uint64_t last_ext_logical = 0; + uint64_t last_ext_len = 0; + uint64_t last_read_size = 0; + + memset (fiemap, 0, sizeof (*fiemap)); + + do + { + fiemap->fm_start = 0ULL; + fiemap->fm_length = FIEMAP_MAX_OFFSET; + fiemap->fm_extent_count = count; + + if (ioctl (src_fd, FS_IOC_FIEMAP, (unsigned long) fiemap) < 0) + return false; + + /* If 0 extents are returned, then more ioctls are not needed. */ + if (fiemap->fm_mapped_extents == 0) + return true; + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + { + uint64_t ext_logical = fm_ext[i].fe_logical; + uint64_t ext_len = fm_ext[i].fe_length; + + if (lseek (src_fd, (off_t) ext_logical, SEEK_SET) < 0LL) + { + error (0, errno, _("cannot lseek %s"), quote (src_name)); + return_val = false; + } + + if (lseek (dest_fd, (off_t) ext_logical, SEEK_SET) < 0LL) + { + error (0, errno, _("cannot lseek %s"), quote (dst_name)); + return_val = false; + } + + if (fm_ext[i].fe_flags & FIEMAP_EXTENT_LAST) + { + last_ext_logical = ext_logical; + last_ext_len = ext_len; + last = 1; + } + + char buf[buf_size]; + while (0 < ext_len) + { + memset (buf, 0, sizeof (buf)); + + /* Avoid reading into the holes if the left extent + length is shorter than the buffer size. 
*/ + if (ext_len < buf_size) + buf_size = ext_len; + + ssize_t n_read = read (src_fd, buf, buf_size); + if (n_read < 0) + { +#ifdef EINTR + if (errno == EINTR) + continue; +#endif + error (0, errno, _("reading %s"), quote (src_name)); + return_val = false; + } + + if (n_read == 0) + { + /* Figure out how many bytes read from the last extent. */ + last_read_size = last_ext_len - ext_len; + break; + } + + if (full_write (dest_fd, buf, n_read) != n_read) + { + error (0, errno, _("writing %s"), quote (dst_name)); + return_val = false; + } + + ext_len -= n_read; + } + + fiemap->fm_start = (fm_ext[i-1].fe_logical + fm_ext[i-1].fe_length); + } + } while (last == 0); + + /* If a file ends up with holes, the sum of the last extent logical offset + and the read returned size should shorter than the actual size of the file. + We should sets the file size to ((struct stat) st_buf.st_size). */ + if (last_ext_logical + last_read_size < src_total_size) + { + if (ftruncate (dest_fd, src_total_size) < 0) + { + error (0, errno, _("truncating %s"), quote (dst_name)); + return_val = false; + } + } + + return return_val; +} +#else +static bool fiemap_copy_ok (ignored) { errno == ENOTSUP; return false; } +#endif + /* FIXME: describe */ /* FIXME: rewrite this to use a hash table so we avoid the quadratic performance hit that's probably noticeable only on trees deeper @@ -679,6 +807,16 @@ copy_reg (char const *src_name, char const *dst_name, #endif } + if (make_holes) + { + /* Perform efficient FIEMAP copy for sparse files, fall back to the + standard copy if fails. */ + if (fiemap_copy_ok (source_desc, dest_desc, + buf_size, src_open_sb.st_size, + src_name, dst_name)) + goto preserve_metadata; + } + /* If not making a sparse file, try to use a more-efficient buffer size. */ if (! 
make_holes) @@ -807,6 +945,8 @@ copy_reg (char const *src_name, char const *dst_name, } } +preserve_metadata: + if (x->preserve_timestamps) { struct timespec timespec[2]; @@ -897,6 +1037,7 @@ close_src_desc: free (buf_alloc); free (name_alloc); + return return_val; } diff --git a/src/fiemap.h b/src/fiemap.h new file mode 100644 index 0000000..d33293b --- /dev/null +++ b/src/fiemap.h @@ -0,0 +1,102 @@ +/* FS_IOC_FIEMAP ioctl infrastructure. + Some portions copyright (C) 2007 Cluster File Systems, Inc + Authors: Mark Fasheh <mfas...@suse.com> + Kalpak Shah <kalpak.s...@sun.com> + Andreas Dilger <adil...@sun.com>. */ + +/* Copy from kernel, modified to respect GNU code style by Jie Liu. */ + +#ifndef _LINUX_FIEMAP_H +# define _LINUX_FIEMAP_H + +# include <linux/types.h> + +struct fiemap_extent +{ + /* Logical offset in bytes for the start of the extent + from the beginning of the file. */ + uint64_t fe_logical; + + /* Physical offset in bytes for the start of the extent + from the beginning of the disk. */ + uint64_t fe_physical; + + /* Length in bytes for this extent. */ + uint64_t fe_length; + + uint64_t fe_reserved64[2]; + + /* FIEMAP_EXTENT_* flags for this extent. */ + uint32_t fe_flags; + + uint32_t fe_reserved[3]; +}; + +struct fiemap +{ + /* Logical offset(inclusive) at which to start mapping(in). */ + uint64_t fm_start; + + /* Logical length of mapping which userspace wants(in). */ + uint64_t fm_length; + + /* FIEMAP_FLAG_* flags for request(in/out). */ + uint32_t fm_flags; + + /* Number of extents that were mapped(out). */ + uint32_t fm_mapped_extents; + + /* Size of fm_extents array(in). */ + uint32_t fm_extent_count; + + uint32_t fm_reserved; + + /* Array of mapped extents(out). */ + struct fiemap_extent fm_extents[0]; +}; + +/* The maximum offset can be mapped for a file. */ +# define FIEMAP_MAX_OFFSET (~0ULL) + +/* Sync file data before map. */ +# define FIEMAP_FLAG_SYNC 0x00000001 + +/* Map extented attribute tree. 
*/ +# define FIEMAP_FLAG_XATTR 0x00000002 + +# define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) + +/* Last extent in file. */ +# define FIEMAP_EXTENT_LAST 0x00000001 + +/* Data location unknown. */ +# define FIEMAP_EXTENT_UNKNOWN 0x00000002 + +/* Location still pending, Sets EXTENT_UNKNOWN. */ +# define FIEMAP_EXTENT_DELALLOC 0x00000004 + +/* Data can not be read while fs is unmounted. */ +# define FIEMAP_EXTENT_ENCODED 0x00000008 + +/* Data is encrypted by fs. Sets EXTENT_NO_BYPASS. */ +# define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 + +/* Extent offsets may not be block aligned. */ +# define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 + +/* Data mixed with metadata. Sets EXTENT_NOT_ALIGNED. */ +# define FIEMAP_EXTENT_DATA_INLINE 0x00000200 + +/* Multiple files in block. Set EXTENT_NOT_ALIGNED. */ +# define FIEMAP_EXTENT_DATA_TAIL 0x00000400 + +/* Space allocated, but not data (i.e. zero). */ +# define FIEMAP_EXTENT_UNWRITTEN 0x00000800 + +/* File does not natively support extents. Result merged for efficiency. */ +# define FIEMAP_EXTENT_MERGED 0x00001000 + +/* Space shared with other files. */ +# define FIEMAP_EXTENT_SHARED 0x00002000 + +#endif -- 1.5.4.3 Jim Meyering wrote: > jeff.liu wrote: >> [...explanation...] > > Thanks. > > ... >>> Run "make syntax-check", or even "make distcheck" > ... >> Sorry to make this stupidly mistake again, I will do double check for >> furture patches submition. > > It's not a big deal. > Notice that you can probably tell your editor > not to do that anymore. > > I've just adjusted the Cc list to include bug-coreutils. > Let's keep the discussion there, for the sake of the bug tracker. > > > > >