Hi Jim, Thanks for your kind advice!
I'd like to adopt the timeout(1) approach for the test. My plan is: 1. Create and mount a file-backed ext4 partition rather than relying on a hard-coded path. 2. Create a 2GB sparse file with no extents allocated for it. 3. It takes nearly 30 seconds to transfer this file with a normal copy, yet less than 1 second with FIEMAP-copy. Is this a worst-case scenario that makes the difference as large as possible? 4. Run FIEMAP-copy under timeout(1) to ensure that it completes within 1 second. I hope I understood your suggestion correctly ;). The revised patches follow: >From b683f930c5e70481c2b6e000a626734f975b99ac Mon Sep 17 00:00:00 2001 From: Jie Liu <jeff....@oracle.com> Date: Thu, 13 May 2010 22:09:30 +0800 Subject: [PATCH 1/1] cp: Add FIEMAP support for efficient sparse file copy * src/fiemap.h: Add fiemap.h for fiemap ioctl(2) support. Copied from linux's include/linux/fiemap.h, with minor formatting changes. * src/copy.c (copy_reg): Now, when `cp' invoked with --sparse=[WHEN] option, we will try to do FIEMAP-copy if the underlaying file system support it, fall back to a normal copy if it fails. Signed-off-by: Jie Liu <jeff....@oracle.com> --- src/copy.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/fiemap.h | 102 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 256 insertions(+), 0 deletions(-) create mode 100644 src/fiemap.h diff --git a/src/copy.c b/src/copy.c index c16cef6..960e5fb 100644 --- a/src/copy.c +++ b/src/copy.c @@ -63,6 +63,10 @@ #include <sys/ioctl.h> +#ifndef HAVE_FIEMAP +# include "fiemap.h" +#endif + #ifndef HAVE_FCHOWN # define HAVE_FCHOWN false # define fchown(fd, uid, gid) (-1) @@ -149,6 +153,136 @@ clone_file (int dest_fd, int src_fd) #endif } +#ifdef __linux__ +# ifndef FS_IOC_FIEMAP +# define FS_IOC_FIEMAP _IOWR ('f', 11, struct fiemap) +# endif +/* Perform FIEMAP(available in mainline 2.6.27) copy if possible. + Call ioctl(2) with FS_IOC_FIEMAP to efficiently map file allocation + excepts holes.
So the overhead to deal with holes with lseek(2) in + normal copy could be saved. This would result in much faster backups + for any kind of sparse file. */ +static bool +fiemap_copy_ok (int src_fd, int dest_fd, size_t buf_size, + off_t src_total_size, char const *src_name, + char const *dst_name, bool *normal_copy_required) +{ + bool fail = false; + bool last = false; + char fiemap_buf[4096]; + struct fiemap *fiemap = (struct fiemap *)fiemap_buf; + struct fiemap_extent *fm_ext = &fiemap->fm_extents[0]; + uint32_t count = (sizeof (fiemap_buf) - sizeof (*fiemap)) / + sizeof (struct fiemap_extent); + off_t last_ext_logical = 0; + uint64_t last_ext_len = 0; + uint64_t last_read_size = 0; + unsigned int i = 0; + + do + { + fiemap->fm_start = 0ULL; + fiemap->fm_length = FIEMAP_MAX_OFFSET; + fiemap->fm_extent_count = count; + + /* When ioctl(2) fails, fall back to the normal copy only if it + is the first time we met. */ + if (ioctl (src_fd, FS_IOC_FIEMAP, (unsigned long) fiemap) < 0) + { + /* If `i > 0', then at least one ioctl(2) has been performed before. */ + if (i == 0) + *normal_copy_required = true; + return false; + } + + /* If 0 extents are returned, then more ioctls are not needed. */ + if (fiemap->fm_mapped_extents == 0) + break; + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + { + assert (fm_ext[i].fe_logical <= OFF_T_MAX); + + off_t ext_logical = fm_ext[i].fe_logical; + uint64_t ext_len = fm_ext[i].fe_length; + + if (lseek (src_fd, ext_logical, SEEK_SET) < 0LL) + { + error (0, errno, _("cannot lseek %s"), quote (src_name)); + return fail; + } + + if (lseek (dest_fd, ext_logical, SEEK_SET) < 0LL) + { + error (0, errno, _("cannot lseek %s"), quote (dst_name)); + return fail; + } + + if (fm_ext[i].fe_flags & FIEMAP_EXTENT_LAST) + { + last_ext_logical = ext_logical; + last_ext_len = ext_len; + last = true; + } + + while (0 < ext_len) + { + char buf[buf_size]; + + /* Avoid reading into the holes if the left extent + length is shorter than the buffer size. 
*/ + if (ext_len < buf_size) + buf_size = ext_len; + + ssize_t n_read = read (src_fd, buf, buf_size); + if (n_read < 0) + { +#ifdef EINTR + if (errno == EINTR) + continue; +#endif + error (0, errno, _("reading %s"), quote (src_name)); + return fail; + } + + if (n_read == 0) + { + /* Figure out how many bytes read from the last extent. */ + last_read_size = last_ext_len - ext_len; + break; + } + + if (full_write (dest_fd, buf, n_read) != n_read) + { + error (0, errno, _("writing %s"), quote (dst_name)); + return fail; + } + + ext_len -= n_read; + } + + fiemap->fm_start = (fm_ext[i-1].fe_logical + fm_ext[i-1].fe_length); + } + } while (! last); + + /* If a file ends up with holes, the sum of the last extent logical offset + and the read-returned size will be shorter than the actual size of the + file. Use ftruncate to extend the length of the destination file. */ + if (last_ext_logical + last_read_size < src_total_size) + { + if (ftruncate (dest_fd, src_total_size) < 0) + { + error (0, errno, _("extending %s"), quote (dst_name)); + return fail; + } + } + + return ! fail; +} +#else +static bool fiemap_copy_ok (ignored) { errno == ENOTSUP; return false; } +#endif + /* FIXME: describe */ /* FIXME: rewrite this to use a hash table so we avoid the quadratic performance hit that's probably noticeable only on trees deeper @@ -679,6 +813,25 @@ copy_reg (char const *src_name, char const *dst_name, #endif } + if (make_holes) + { + bool require_normal_copy = false; + /* Perform efficient FIEMAP copy for sparse files, fall back to the + standard copy only if the ioctl(2) fails. */ + if (fiemap_copy_ok (source_desc, dest_desc, buf_size, + src_open_sb.st_size, src_name, + dst_name, &require_normal_copy)) + goto preserve_metadata; + else + { + if (! require_normal_copy) + { + return_val = false; + goto close_src_and_dst_desc; + } + } + } + /* If not making a sparse file, try to use a more-efficient buffer size. */ if (! 
make_holes) @@ -807,6 +960,7 @@ copy_reg (char const *src_name, char const *dst_name, } } +preserve_metadata: if (x->preserve_timestamps) { struct timespec timespec[2]; diff --git a/src/fiemap.h b/src/fiemap.h new file mode 100644 index 0000000..d33293b --- /dev/null +++ b/src/fiemap.h @@ -0,0 +1,102 @@ +/* FS_IOC_FIEMAP ioctl infrastructure. + Some portions copyright (C) 2007 Cluster File Systems, Inc + Authors: Mark Fasheh <mfas...@suse.com> + Kalpak Shah <kalpak.s...@sun.com> + Andreas Dilger <adil...@sun.com>. */ + +/* Copy from kernel, modified to respect GNU code style by Jie Liu. */ + +#ifndef _LINUX_FIEMAP_H +# define _LINUX_FIEMAP_H + +# include <linux/types.h> + +struct fiemap_extent +{ + /* Logical offset in bytes for the start of the extent + from the beginning of the file. */ + uint64_t fe_logical; + + /* Physical offset in bytes for the start of the extent + from the beginning of the disk. */ + uint64_t fe_physical; + + /* Length in bytes for this extent. */ + uint64_t fe_length; + + uint64_t fe_reserved64[2]; + + /* FIEMAP_EXTENT_* flags for this extent. */ + uint32_t fe_flags; + + uint32_t fe_reserved[3]; +}; + +struct fiemap +{ + /* Logical offset(inclusive) at which to start mapping(in). */ + uint64_t fm_start; + + /* Logical length of mapping which userspace wants(in). */ + uint64_t fm_length; + + /* FIEMAP_FLAG_* flags for request(in/out). */ + uint32_t fm_flags; + + /* Number of extents that were mapped(out). */ + uint32_t fm_mapped_extents; + + /* Size of fm_extents array(in). */ + uint32_t fm_extent_count; + + uint32_t fm_reserved; + + /* Array of mapped extents(out). */ + struct fiemap_extent fm_extents[0]; +}; + +/* The maximum offset can be mapped for a file. */ +# define FIEMAP_MAX_OFFSET (~0ULL) + +/* Sync file data before map. */ +# define FIEMAP_FLAG_SYNC 0x00000001 + +/* Map extented attribute tree. 
*/ +# define FIEMAP_FLAG_XATTR 0x00000002 + +# define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) + +/* Last extent in file. */ +# define FIEMAP_EXTENT_LAST 0x00000001 + +/* Data location unknown. */ +# define FIEMAP_EXTENT_UNKNOWN 0x00000002 + +/* Location still pending, Sets EXTENT_UNKNOWN. */ +# define FIEMAP_EXTENT_DELALLOC 0x00000004 + +/* Data can not be read while fs is unmounted. */ +# define FIEMAP_EXTENT_ENCODED 0x00000008 + +/* Data is encrypted by fs. Sets EXTENT_NO_BYPASS. */ +# define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 + +/* Extent offsets may not be block aligned. */ +# define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 + +/* Data mixed with metadata. Sets EXTENT_NOT_ALIGNED. */ +# define FIEMAP_EXTENT_DATA_INLINE 0x00000200 + +/* Multiple files in block. Set EXTENT_NOT_ALIGNED. */ +# define FIEMAP_EXTENT_DATA_TAIL 0x00000400 + +/* Space allocated, but not data (i.e. zero). */ +# define FIEMAP_EXTENT_UNWRITTEN 0x00000800 + +/* File does not natively support extents. Result merged for efficiency. */ +# define FIEMAP_EXTENT_MERGED 0x00001000 + +/* Space shared with other files. */ +# define FIEMAP_EXTENT_SHARED 0x00002000 + +#endif -- 1.5.4.3 >From f18e1801d1dfca9fa278572b8172a5f97da2adc1 Mon Sep 17 00:00:00 2001 From: Jie Liu <jeff....@oracle.com> Date: Thu, 13 May 2010 22:17:53 +0800 Subject: [PATCH 1/1] tests: add a new test for FIEMAP-copy * tests/cp/sparse-fiemap: Add a new test for FIEMAP-copy against a loopbacked ext4 partition. * tests/Makefile.am (sparse-fiemap): Reference the new test. 
Signed-off-by: Jie Liu <jeff....@oracle.com> --- tests/Makefile.am | 2 + tests/cp/sparse-fiemap | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 0 deletions(-) create mode 100644 tests/cp/sparse-fiemap diff --git a/tests/Makefile.am b/tests/Makefile.am index 46d388a..a76c6a7 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -25,6 +25,7 @@ root_tests = \ cp/special-bits \ cp/cp-mv-enotsup-xattr \ cp/capability \ + cp/sparse-fiemap \ dd/skip-seek-past-dev \ install/install-C-root \ ls/capability \ @@ -319,6 +320,7 @@ TESTS = \ cp/same-file \ cp/slink-2-slink \ cp/sparse \ + cp/sparse-fiemap \ cp/special-f \ cp/src-base-dot \ cp/symlink-slash \ diff --git a/tests/cp/sparse-fiemap b/tests/cp/sparse-fiemap new file mode 100644 index 0000000..f9d3a94 --- /dev/null +++ b/tests/cp/sparse-fiemap @@ -0,0 +1,61 @@ +#!/bin/sh +# Test cp --sparse=always through fiemap copy + +# Copyright (C) 2006-2010 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +if test "$VERBOSE" = yes; then + set -x + cp --version +fi + +. 
$srcdir/test-lib.sh +require_root_ + +cwd=`pwd` +cleanup_() { cd /; umount "$cwd/mnt"; } + +# Create an ext4 loopback file system +dd if=/dev/zero of=blob bs=8192 count=1000 > /dev/null 2>&1 \ + || skip=1 +mkdir mnt +mkfs -t ext4 -F blob || + skip_test_ "failed to create ext4 file system" +mount -oloop blob mnt || skip=1 +echo test > mnt/f || skip=1 +test -s mnt/f || skip=1 + +test $skip = 1 && + skip_test_ "insufficient mount/ext4 support" + +rm -f mnt/f + +# Create a 2gb sparse file +dd if=/dev/zero of=mnt/sparse bs=1k count=1 seek=2096128 > /dev/null 2>&1 || framework_failure + +# It take more than 20 seconds to transfer the created sparse file +# through normal copy, by contrast, it take even less than 1 second +# through FIEMAP-copy. +timeout 1 cp --sparse=always mnt/sparse mnt/sparse_fiemap || fail=1 +test $? = 124 && fail=1 + +# Ensure that the sparse file copied through fiemap has the same size +# in bytes as the original. +test `stat --printf %s $sparse` = `stat --printf %s $fiemap` || fail=1 + +rm -f mnt/sparse +rm -f mnt/sparse_fiemap + +Exit $fail -- 1.5.4.3 Thanks, -Jeff > > Then I remembered that here we have timeout(1), so: > you may ignore the above and consider this a suggestion > to use timeout: > > But that was in Parted, where we can't guarantee that the timeout > program is available. Here in coreutils, you're guaranteed to > have timeout(1) (just built), so you might want to use it, too: > Contrive a test that takes a very long time without FIEMAP support > yet that runs in a couple seconds with it. Then run cp via timeout > with a 10-second limit. If timeout's exit status is not 0, > then make the test fail. > > That has the advantage of letting you use an example that would take > far longer that we typically want to wait for a non-FIEMAP test. > I.e., perform only the FIEMAP-copy and ensure that it's "quick enough". > You don't have to perform a non-FIEMAP one. 
> > Another advantage: if you don't do the old/slow sparse copy, > there's no need for comparison (and bc or awk) at all. > > > -- With Windows 7, Microsoft is asserting legal control over your computer and is using this power to abuse computer users.