Sure, happy to help. I did not see mknod+setxattr in the strace output. Included is a trimmed version of the strace output, along with a few more bits of information. Thanks!
# cat /proc/fs/lustre/version lustre: 2.7.19.8 # cat /etc/redhat-release CentOS Linux release 7.3.1611 (Core) # uname -r 3.10.0-514.2.2.el7_lustre.x86_64 # rpm -qa|grep tar tar-1.26-31.el7.x86_64 # sha1sum `which tar` `which gtar` ea17ec98894212b2e2285eb2dd99aad76185ea7d /usr/bin/tar ea17ec98894212b2e2285eb2dd99aad76185ea7d /usr/bin/gtar Striping was set on the four directories before creating the files. mkdir -p /scratch/1; lfs setstripe -c 1 --stripe-size 128K /scratch/1; lfs getstripe /scratch/1 mkdir -p /scratch/2; lfs setstripe -c 2 --stripe-size 256K /scratch/2; lfs getstripe /scratch/2 mkdir -p /scratch/3; lfs setstripe -c 3 --stripe-size 768K /scratch/3; lfs getstripe /scratch/3 mkdir -p /scratch/4; lfs setstripe -c 4 --stripe-size 1M /scratch/4; lfs getstripe /scratch/4 After tar, all files and directories had the default Lustre striping. # tar ztvf /scratch.tgz drwxr-xr-x root/root 0 2017-03-19 10:54 scratch/ drwxr-xr-x root/root 0 2017-03-19 10:57 scratch/4/ -rw-r--r-- root/root 4194304 2017-03-19 10:57 scratch/4/4.dd drwxr-xr-x root/root 0 2017-03-19 10:57 scratch/3/ -rw-r--r-- root/root 4194304 2017-03-19 10:57 scratch/3/3.dd drwxr-xr-x root/root 0 2017-03-19 10:57 scratch/1/ -rw-r--r-- root/root 4194304 2017-03-19 10:57 scratch/1/1.dd drwxr-xr-x root/root 0 2017-03-19 10:57 scratch/2/ -rw-r--r-- root/root 4194304 2017-03-19 10:57 scratch/2/2.dd # strace tar zxvf /scratch.tgz > strace.out 2>&1 execve("/usr/bin/tar", ["tar", "zxvf", "/scratch.tgz"], [/* 22 vars */]) = 0 ... (-cut - loading libraries) ... 
fstat(1, {st_mode=S_IFREG|0644, st_size=10187, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4a63d9f000 write(1, "scratch/\n", 9scratch/ ) = 9 mkdirat(AT_FDCWD, "scratch", 0700) = -1 EEXIST (File exists) newfstatat(AT_FDCWD, "scratch", {st_mode=S_IFDIR|0755, st_size=4096, ...}, AT_SYMLINK_NOFOLLOW) = 0 write(1, "scratch/4/\n", 11scratch/4/ ) = 11 mkdirat(AT_FDCWD, "scratch/4", 0700) = 0 write(1, "scratch/4/4.dd\n", 15scratch/4/4.dd ) = 15 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 openat(AT_FDCWD, "scratch/4/4.dd", O_WRONLY|O_CREAT|O_EXCL|O_NOCTTY|O_NONBLOCK|O_CLOEXEC, 0600) = 4 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 5632) = 5632 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 5632) = 5632 ... (-cut) ... write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512 dup2(4, 4) = 4 fstat(4, {st_mode=S_IFREG|0600, st_size=4194304, ...}) = 0 utimensat(4, NULL, {{1489935825, 0}, {1489935444, 0}}, 0) = 0 fchown(4, 0, 0) = 0 fchmod(4, 0644) = 0 close(4) = 0 write(1, "scratch/3/\n", 11scratch/3/ ) = 11 newfstatat(AT_FDCWD, "scratch/4", {st_mode=S_IFDIR|0700, st_size=4096, ...}, AT_SYMLINK_NOFOLLOW) = 0 utimensat(AT_FDCWD, "scratch/4", {{1489935825, 0}, {1489935444, 0}}, AT_SYMLINK_NOFOLLOW) = 0 fchownat(AT_FDCWD, "scratch/4", 0, 0, AT_SYMLINK_NOFOLLOW) = 0 fchmodat(AT_FDCWD, "scratch/4", 0755) = 0 mkdirat(AT_FDCWD, "scratch/3", 0700) = 0 write(1, "scratch/3/3.dd\n", 15scratch/3/3.dd ) = 15 openat(AT_FDCWD, "scratch/3/3.dd", O_WRONLY|O_CREAT|O_EXCL|O_NOCTTY|O_NONBLOCK|O_CLOEXEC, 0600) = 4 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 6656) = 6656 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, 
"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 ... (-cut - pick up with last file...) ... d(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=2476, si_status=0, si_utime=7, si_stime=0} --- write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 10240) = 10240 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 7680) = 7680 dup2(4, 4) = 4 fstat(4, {st_mode=S_IFREG|0600, st_size=4194304, ...}) = 0 utimensat(4, NULL, {{1489935825, 0}, {1489935432, 0}}, 0) = 0 fchown(4, 0, 0) = 0 fchmod(4, 0644) = 0 close(4) = 0 clock_gettime(CLOCK_REALTIME, {1489935825, 628399394}) = 0 clock_gettime(CLOCK_REALTIME, {1489935825, 628414336}) = 0 close(3) = 0 wait4(2476, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 2476 newfstatat(AT_FDCWD, "scratch/2", {st_mode=S_IFDIR|0700, st_size=4096, ...}, AT_SYMLINK_NOFOLLOW) = 0 utimensat(AT_FDCWD, "scratch/2", {{1489935825, 0}, {1489935432, 0}}, AT_SYMLINK_NOFOLLOW) = 0 fchownat(AT_FDCWD, "scratch/2", 0, 0, 
AT_SYMLINK_NOFOLLOW) = 0 fchmodat(AT_FDCWD, "scratch/2", 0755) = 0 newfstatat(AT_FDCWD, "scratch", {st_mode=S_IFDIR|0755, st_size=4096, ...}, 0) = 0 utimensat(AT_FDCWD, "scratch", {{1489934977, 0}, {1489935261, 0}}, 0) = 0 fchownat(AT_FDCWD, "scratch", 0, 0, 0) = 0 close(1) = 0 munmap(0x7f4a63d9f000, 4096) = 0 close(2) = 0 exit_group(0) = ? +++ exited with 0 +++ Brett -- Protect Yourself Against Cybercrime PDS Software Solutions LLC https://www.TrustPDS.com <https://www.trustpds.com/> On Sun, Mar 19, 2017 at 7:39 AM, Dilger, Andreas <andreas.dil...@intel.com> wrote: > I ran a test locally with RHEL 6.8 and the included tar 1.26 using strace, > and tar is properly using mknod+setxattr to restore the "lov" xattr, and > the stripe count and stripe size are preserved. > > The OST index is not preserved with the xattr restore, since that may > cause imbalance if the files were backed up in a different filesystem > (e.g. one with fewer OSTs). The MDS will balance OST allocation as needed > for the current OST usage. > > Could you please run your tar on RHEL 7 with strace to see if it is doing > this correctly? > > Cheers, Andreas > > On Mar 18, 2017, at 21:51, Brett Lee <brettlee.lus...@gmail.com> wrote: > > Hi Andreas, I expected that to be the case, but found out it was not. > Instead, the restore restores everything - unless directed otherwise. > > Backup == cmd + add xattrs. > Restore == cmd + exclude xattrs. > > Brett > -- > Protect Yourself Against Cybercrime > PDS Software Solutions LLC > https://www.TrustPDS.com > On Mar 18, 2017 9:28 PM, "Dilger, Andreas" <andreas.dil...@intel.com> > wrote: > >> Do you need to specify --xattrs (or similar) during the restore phase as >> well? >> >> Cheers, Andreas >> >> On Mar 17, 2017, at 15:12, Brett Lee <brettlee.lus...@gmail.com> wrote: >> >> Hi. In what I thought was a valid test, I was unable to confirm that a >> backup and restore retained the layouts. Perhaps my expectation or process >> was incorrect? 
The process was: >> >> 1. Create 4 files, each with different stripe sizes and stripe counts >> (verified with getstripe). >> 2. Back up the files using tar-1.26-31.el7.x86_64. >> 3. Recreate a file system and restore the files. >> >> Backup command: tar --xattrs -zcvf /scratch.tgz /scratch >> Restore command: tar zxvf /scratch.tgz >> >> After restoration, getstripe showed that each file had the default stripe >> count (1) and stripe size (1MB). >> FWIW: After restoring, getfattr produced the same result for each file: >> # getfattr -d -m - -R <file> >> lustre.lov=0s0AvRCwEAAAAdAAAAAAAAAAAEAAACAAAAAAAQAAEAAAAFAAA >> AAAAAAAAAAAAAAAAAAAAAAAAAAAA= >> trusted.link=0s3/HqEQEAAAAuAAAAAAAAAAAAAAAAAAAAABYAAAACAAAEA >> AAAAAUAAAAAMS5kZA== >> trusted.lma=0sAAAAAAAAAAAABAAAAgAAAB0AAAAAAAAA >> trusted.lov=0s0AvRCwEAAAAdAAAAAAAAAAAEAAACAAAAAAAQAAEAAAAFAA >> AAAAAAAAAAAAAAAAAAAAAAAAAAAAA= >> >> Brett >> -- >> Protect Yourself Against Cybercrime >> PDS Software Solutions LLC >> https://www.TrustPDS.com <https://www.trustpds.com/> >> >> On Wed, Mar 15, 2017 at 5:03 AM, Dilger, Andreas < >> andreas.dil...@intel.com> wrote: >> >>> I believe Zmanda is already using GNU tar (or RHEL tar) for the actual >>> backup storage? In that case it should already work, since we fixed tar >>> long ago to backup and restore xattrs in a way that preserves Lustre >>> layouts. >>> >>> Cheers, Andreas >>> >>> On Mar 14, 2017, at 15:47, Brett Lee <brettlee.lus...@gmail.com> wrote: >>> >>> Thanks for the details, Andreas! >>> >>> Maybe OpenSFS can fund Zmanda so that their backup software can include >>> the Lustre metadata... 
:) >>> >>> Brett >>> -- >>> Protect Yourself Against Cybercrime >>> PDS Software Solutions LLC >>> https://www.TrustPDS.com <https://www.trustpds.com/> >>> >>> On Tue, Mar 14, 2017 at 3:13 PM, Dilger, Andreas < >>> andreas.dil...@intel.com> wrote: >>> >>>> To reply to this old thread, there are two different kinds of Lustre >>>> backup solutions: >>>> - file level backups that traverse the client POSIX filesystem, for >>>> which any number of >>>> commercial solutions exist. Making these solutions "capable of >>>> saving Lustre metadata" >>>> boils down to two simple things - save the "lustre.lov" xattr during >>>> backup (at a minimum, >>>> other xattrs also should be backed up), and then using mknod(2) + >>>> setxattr() to restore >>>> the "lustre.lov" xattr before opening the file and restoring the data. >>>> >>>> - device level backups (e.g. "dd" for ldiskfs, and "zfs send/recv" for >>>> ZFS). >>>> >>>> Using the file level backups allows backup/restore of subsets of the >>>> filesystem, since many >>>> HPC sites have Lustre filesystems that are too large to backup >>>> completely. I typically do >>>> not recommend to use device-level backups for the OSTs, unless doing an >>>> OST hardware migration, >>>> and even then it is probably less disruptive to do Lustre-level file >>>> migration off the OST >>>> before swapping it out. >>>> >>>> Whether file level backups are used or not, I would recommend sites >>>> always make periodic >>>> device level backups of the MDT(s). The amount of space needed for an >>>> MDT backup is small >>>> compared to the rest of the filesystem (e.g. a few TB at most), and can >>>> avoid the need for >>>> a full filesystem restore (e.g. multi-PB of data, if a full backup >>>> exists at all) even >>>> though all the data is still available on the OSTs. 
>>>> >>>> The MDT device-level backup can use relatively slow SATA drives, since >>>> they will mostly be >>>> used for linear writes (or occasionally linear reads for restore), so a >>>> few multi-TB SATA III >>>> drives are sufficient for storing a rotating set of MDT device backups. >>>> At 150MB/s for even >>>> a single SATA drive, this is about 2h/TB, which is reasonable to do >>>> once a week (or more often >>>> for smaller MDTs). >>>> >>>> While using an LVM snapshot of the ldiskfs MDT for the backup source is >>>> desirable for consistency >>>> reasons, having even an MDT backup from a mounted and in-use MDT is >>>> better than nothing at >>>> all when a problem is hit, since e2fsck can repair the in-use >>>> inconsistencies fairly easily, >>>> and Lustre can deal with inconsistencies between the MDT and OST >>>> reasonably (at most returning >>>> an -ENOENT error to the client for files that were deleted). >>>> >>>> Cheers, Andreas >>>> >>>> On Feb 7, 2017, at 12:32, Andrew Holway <andrew.hol...@gmail.com> >>>> wrote: >>>> > >>>> > Would it be difficult to suspend IO and snapshot all the nodes >>>> (assuming ZFS). Could you be sure that your MDS and OSS are synchronised? >>>> > >>>> > On 7 February 2017 at 19:52, Mike Selway <msel...@cray.com> wrote: >>>> >> Hello Brett, >>>> >> >>>> >> Actually, looking for someone who uses a >>>> commercialized approach (that retains user metadata and Lustre extended >>>> metadata) and not specifically the manual approaches of Chapter 17. >>>> >> >>>> >> Thanks! >>>> >> Mike >>>> >> >>>> >> Mike Selway | Sr. Tiered Storage Architect | Cray Inc. 
>>>> >> Work +1-301-332-4116 | msel...@cray.com >>>> >> 146 Castlemaine Ct, Castle Rock, CO 80104 | www.cray.com >>>> >> >>>> >> >>>> >>> From: Brett Lee [mailto:brettlee.lus...@gmail.com] >>>> >>> Sent: Monday, February 06, 2017 11:45 AM >>>> >>> To: Mike Selway <msel...@cray.com> >>>> >>> Cc: lustre-discuss@lists.lustre.org >>>> >>> Subject: Re: [lustre-discuss] Backup software for Lustre >>>> >>> >>>> >>> Hey Mike, >>>> >>> >>>> >>> "Chapter 17" and >>>> >>> http://www.intel.com/content/www/us/en/lustre/backup-and-res >>>> tore-training.html >>>> >>> >>>> >>> both contain methods to backup & restore the entire Lustre file >>>> system. >>>> >>> >>>> >>> Are you looking for a solution that backs up only the (user) data >>>> files and their associated metadata (e.g. xattrs)? >>>> >>> >>>> >>> Brett >>>> >>> -- >>>> >>> Protect Yourself From Cybercrime >>>> >>> PDS Software Solutions LLC >>>> >>> https://www.TrustPDS.com >>>> >>> >>>> >>>> On Mon, Feb 6, 2017 at 11:12 AM, Mike Selway <msel...@cray.com> >>>> wrote: >>>> >>>> >>>> >>>> Hello, >>>> >>>> Anyone aware of and/or using a Backup software package to >>>> protect their LFS environment (not referring to the tools/scripts suggested >>>> in Chapter 17). >>>> >>>> >>>> >>>> Regards, >>>> >>>> Mike >>>> >>>> Cheers, Andreas >>>> -- >>>> Andreas Dilger >>>> Lustre Principal Architect >>>> Intel Corporation >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> _______________________________________________ >>>> lustre-discuss mailing list >>>> lustre-discuss@lists.lustre.org >>>> http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org >>>> >>> >>> >>
_______________________________________________ lustre-discuss mailing list lustre-discuss@lists.lustre.org http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org