Re: [RFC][PATCH] Btrfs: Fix uninitialized root flags for subvolumes
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 28.03.2011 04:01, Li Zefan wrote: > root_item->flags and root_item->byte_limit are not initialized when > a subvolume is created. This bug is not revealed until we added > readonly snapshot support - now you mount a btrfs filesystem and you > may find the subvolumes in it are readonly. > > To work around this problem, we steal a bit from root_item->inode_item->flags, > and use it to indicate if those fields have been properly initialized. > When we read a tree root from disk, we check if the bit is set, and if > not we'll set the flag and initialize the two fields of the root item. > > Reported-by: Andreas Philipp > Signed-off-by: Li Zefan Tested-by: Andreas Philipp > --- > fs/btrfs/ctree.h | 4 > fs/btrfs/disk-io.c | 4 +++- > fs/btrfs/ioctl.c | 4 > fs/btrfs/root-tree.c | 18 ++ > fs/btrfs/transaction.c | 1 + > 5 files changed, 30 insertions(+), 1 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 8b4b9d1..ff6b991 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1284,6 +1284,8 @@ struct btrfs_root { > #define BTRFS_INODE_NOATIME (1 << 9) > #define BTRFS_INODE_DIRSYNC (1 << 10) > > +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) > + > /* some macros to generate set/get funcs for the struct fields. This > * assumes there is a lefoo_to_cpu for every type, so lets make a simple > * one for u8: > @@ -2355,6 +2357,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); > int btrfs_find_orphan_roots(struct btrfs_root *tree_root); > int btrfs_set_root_node(struct btrfs_root_item *item, > struct extent_buffer *node); > +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); > + > /* dir-item.c */ > int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, > struct btrfs_root *root, const char *name, > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 3e1ea3e..4f8dafc 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -1184,8 +1184,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, > root->commit_root = btrfs_root_node(root); > BUG_ON(!root->node); > out: > - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) > + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { > root->ref_cows = 1; > + btrfs_check_and_init_root_item(&root->root_item); > + } > > return root; > } > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index 5fdb2ab..2ff51e6 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c > @@ -294,6 +294,10 @@ static noinline int create_subvol(struct btrfs_root *root, > inode_item->nbytes = cpu_to_le64(root->leafsize); > inode_item->mode = cpu_to_le32(S_IFDIR | 0755); > > + root_item.flags = 0; > + root_item.byte_limit = 0; > + inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); > + > btrfs_set_root_bytenr(&root_item, leaf->start); > btrfs_set_root_generation(&root_item, trans->transid); > btrfs_set_root_level(&root_item, 0); > diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c > index 6a1086e..3e45c32 100644 > --- a/fs/btrfs/root-tree.c > +++ b/fs/btrfs/root-tree.c > @@ -471,3 +471,21 @@ again: > btrfs_free_path(path); > return 0; > } > + > +/* > + * Old btrfs forgets to init root_item->flags and root_item->byte_limit > + * for subvolumes. To work around this problem, we steal a bit from > + * root_item->inode_item->flags, and use it to indicate if those fields > + * have been properly initialized. > + */ > +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) > +{ > + u64 inode_flags = le64_to_cpu(root_item->inode.flags); > + > + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { > + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; > + root_item->inode.flags = cpu_to_le64(inode_flags); > + root_item->flags = 0; > + root_item->byte_limit = 0; > + } > +} > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index 3d73c8d..f3d6681 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -970,6 +970,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, > record_root_in_trans(trans, root); > btrfs_set_root_last_snapshot(&root->root_item, trans->transid); > memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); > + btrfs_check_and_init_root_item(new_root_item); > > root_flags = btrfs_root_flags(new_root_item); > if (pending->readonly) -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.9 (MingW32) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQIcBAEBAgAGBQJNktS2AAoJEJIcBJ3+XkgiB4gP/RqCnQQj2RcM0pxpjPqA0+GE HC5DqXGhqYa+6Y+WIKWZJIuhpxF+x5c885U7MBYU1f5UJB6bXttYVGE5Gq7pXgs5 aWezaX3WKIOsgfLWVRFezWtlmWvUuBgz3K2cLq9U+NcUNp13KjP4nI5QWwn0xo27 o4GjqdtHiN3Fen8/BCVUTHySUlFxqo2o7DvNB9KysW8Vz0zOS8Y0SXgUN+4u5p7L 7Y76UOiayMpVF6JaGQzcqGgf5WQyA8xWtYUAzuxBs+MA1Xfbogu4Oen59FKxR4BI c0Ihw2GddQfxHvGQLXXXK2BuCpZfbpTD3hdxRnAzRspk8OqpBU0DYnZv0YybsqLw XC+PM+oEUtsTAxaID0MWcoqfk40S54xv36PXgKF0RjIogJFRyxq1vNNCR5hzxVRX hjVvBc31QWyE+Ek
[PATCH] Btrfs: fix compiler warning in file.c
While compiling Btrfs, I got following messages: CC [M] fs/btrfs/file.o fs/btrfs/file.c: In function '__btrfs_buffered_write': fs/btrfs/file.c:909: warning: 'ret' may be used uninitialized in this function CC [M] fs/btrfs/tree-defrag.o This patch fixes compiler warning. Signed-off-by: Tsutomu Itoh --- fs/btrfs/file.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff -urNp linux-2.6.39-rc1/fs/btrfs/file.c linux-2.6.39-rc1.new/fs/btrfs/file.c --- linux-2.6.39-rc1/fs/btrfs/file.c2011-03-30 04:09:47.0 +0900 +++ linux-2.6.39-rc1.new/fs/btrfs/file.c2011-03-30 09:41:49.0 +0900 @@ -906,7 +906,7 @@ static noinline ssize_t __btrfs_buffered unsigned long last_index; size_t num_written = 0; int nrptrs; - int ret; + int ret = 0; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs: return EXDEV when linking from different subvolumes
On Tue, Mar 22, 2011 at 1:20 PM, Mark Fasheh wrote: > btrfs_link returns EPERM if a cross-subvolume link is attempted. > > However, in this case I believe EXDEV to be the more appropriate value. > From the link(2) man page: This makes makes sense, that's the behavior link(2) normally returns. However, ln(1) doesn't attempt the link at all across separate filesystems - it checks fstat(... .st_dev) on the source an destination. On suvolumes, that's the same.It's a slight userspace difference and I'm not sure it matters. However, what stops hardlinks across subvolumes? I understand that when you delete a subvolume you'll have to recurse through to decrease the link count and possibly drop the inode, but Isn't there a scan going on already for the CoW blocks created by snapshot? What's the difference between a hardlink made across subvolumes and a hardlink cloned when a snapshot is made? I ask because I hit that when sorting 10 years of downloads/camera offloads/documents etc and I wanted to not duplicate the enormous amount of storage into the new volume. I ended up just biting the bullet and duplicating 250gb - I've got an unfortunate "filesystem as it was" backup subvolume which will retain the blocks even if the current /home is cleaned up. As a workaround I could find huge files and delete them on the backup, but it would seem that using link(2) to tell btrfs that these blocks will be the same makes it trivial to do from userspace. Addendum: BTRFS subvolumes: /sorted /home-as-it-was /home (pruned lost but valuable files) (daily snapshots of /sorted and /home) home-as-it-was was made by using rsync on /home to /home-as-it-was, /home was a snapshot, and /sorted was created as it's own subvolume. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: How to remove a device on a RAID-1 before replacing it?
On Tue, Mar 29, 2011 at 05:01:39PM -0400, Andrew Lutomirski wrote: > On Tue, Mar 29, 2011 at 4:21 PM, cwillu wrote: > > On Tue, Mar 29, 2011 at 2:09 PM, Andrew Lutomirski wrote: > >> I have a disk with a SMART failure. It still works but I assume it'll > >> fail sooner or later. > >> > >> I want to remove it from my btrfs volume, replace it, and add the new > >> one. But the obvious command doesn't work: > >> > >> # btrfs device delete /dev/dm-5 /mnt/foo > >> ERROR: error removing the device '/dev/dm-5' > >> > >> dmesg says: > >> btrfs: unable to go below two devices on raid1 > >> > >> With mdadm, I would fail the device, remove it, run degraded until I > >> get a new device, and hot-add that device. > >> > >> With btrfs, I'd like some confirmation from the fs that data is > >> balanced appropriately so I won't get data loss if I just yank the > >> drive. And I don't even know how to tell btrfs to release the drive > >> so I can safely remove it. > >> > >> (Mounting with -o degraded doesn't help. I could umount, remove the > >> disk, then remount, but that feels like a hack.) > > > > There's no "nice" way to remove a failing disk in btrfs right now > > ("btrfs dev delete" is more of a online management thing to politely > > remove a perfectly functional disk you'd like to use for something > > else.) As I understand things, the only way to do it right now is the > > umount, remove disk, remount w/ degraded, and then btrfs add the new > > device. > > > > Well, the disk *is* perfectly functional. It just won't be for long. > > I guess what I'm saying is that either btrfs dev delete isn't really > working -- I want to be able to convert to non-RAID and back or > degraged and back or something else equivalent. RAID conversion isn't quite ready yet, sadly. As I understand it, you've got two options: - Yoink the drive (thus making the fs run in degraded mode), add the new one, and balance to spread the duplicate data onto the new volume. - Add the new drive to the FS first, then use btrfs dev del to remove the original device. This should end up writing all the replicated data to the new drive as it "removes" the data from the old one. Of the two options, the latter is (for me) the favourite, as you don't end up with a filesystem that's running on just a single copy of the data. Hugo. -- === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk === PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk --- Prof Brain had been in search of The Truth for 25 years, with --- the intention of putting it under house arrest. signature.asc Description: Digital signature
Re: How to remove a device on a RAID-1 before replacing it?
On Tue, Mar 29, 2011 at 4:21 PM, cwillu wrote: > On Tue, Mar 29, 2011 at 2:09 PM, Andrew Lutomirski wrote: >> I have a disk with a SMART failure. It still works but I assume it'll >> fail sooner or later. >> >> I want to remove it from my btrfs volume, replace it, and add the new >> one. But the obvious command doesn't work: >> >> # btrfs device delete /dev/dm-5 /mnt/foo >> ERROR: error removing the device '/dev/dm-5' >> >> dmesg says: >> btrfs: unable to go below two devices on raid1 >> >> With mdadm, I would fail the device, remove it, run degraded until I >> get a new device, and hot-add that device. >> >> With btrfs, I'd like some confirmation from the fs that data is >> balanced appropriately so I won't get data loss if I just yank the >> drive. And I don't even know how to tell btrfs to release the drive >> so I can safely remove it. >> >> (Mounting with -o degraded doesn't help. I could umount, remove the >> disk, then remount, but that feels like a hack.) > > There's no "nice" way to remove a failing disk in btrfs right now > ("btrfs dev delete" is more of a online management thing to politely > remove a perfectly functional disk you'd like to use for something > else.) As I understand things, the only way to do it right now is the > umount, remove disk, remount w/ degraded, and then btrfs add the new > device. > Well, the disk *is* perfectly functional. It just won't be for long. I guess what I'm saying is that either btrfs dev delete isn't really working -- I want to be able to convert to non-RAID and back or degraged and back or something else equivalent. --Andy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: How to remove a device on a RAID-1 before replacing it?
Hallo, cwillu, Du meintest am 29.03.11: >> I have a disk with a SMART failure. It still works but I assume >> it'll fail sooner or later. >> >> I want to remove it from my btrfs volume, replace it, and add the >> new one. But the obvious command doesn't work: [...] > There's no "nice" way to remove a failing disk in btrfs right now > ("btrfs dev delete" is more of a online management thing to politely > remove a perfectly functional disk you'd like to use for something > else.) Nice hope, but even "btrfs device delete /dev/sdxn /mnt/btr" doesn't work well. I've tried it, with kernel 2.6.38.1 ... # mkfs.btrfs -L SCSI -m raid0 /dev/sda1 # mount LABEL=SCSI /mnt/btr # btrfs filesystem show Label: 'SCSI' uuid: 4d834705-5a65-4d1f-a8a0-a5ea9348db50 Total devices 1 FS bytes used 28.00KB devid1 size 4.04GB used 20.00MB path /dev/sda1 Btrfs Btrfs v0.19 # btrfs filesystem df /mnt/btr Data: total=8.00MB, used=0.00 System: total=4.00MB, used=8.00KB Metadata: total=8.00MB, used=20.00KB # df -t btrfs FilesystemType 1K-blocks Used Available Use% Mounted on /dev/sda1btrfs 423255628 4220224 1% /mnt/btr # gdisk -l /dev/sda 12048 8467166 4.0 GiB 0700 Linux/Windows data # # btrfs device add /dev/sdc1 /mnt/btr # btrfs filesystem show Label: 'SCSI' uuid: 4d834705-5a65-4d1f-a8a0-a5ea9348db50 Total devices 2 FS bytes used 28.00KB devid1 size 4.04GB used 433.31MB path /dev/sda1 devid2 size 8.54GB used 0.00 path /dev/sdc1 Btrfs Btrfs v0.19 # btrfs filesystem df /mnt/btr Data: total=421.31MB, used=0.00 System: total=4.00MB, used=4.00KB Metadata: total=8.00MB, used=24.00KB # df -t btrfs FilesystemType 1K-blocks Used Available Use% Mounted on /dev/sda1btrfs1318963228 13176256 1% /mnt/btr # gdisk -l /dev/sda 12048 8467166 4.0 GiB 0700 Linux/Windows data /dev/sdc 1204817916206 8.5 GiB 0700 Linux/Windows data # # 2 GByte kopiert # btrfs filesystem show Label: 'SCSI' uuid: 4d834705-5a65-4d1f-a8a0-a5ea9348db50 Total devices 2 FS bytes used 2.10GB devid1 size 4.04GB used 433.31MB path /dev/sda1 devid2 size 8.54GB used 2.25GB path /dev/sdc1 Btrfs Btrfs v0.19 # btrfs filesystem df /mnt/btr Data: total=2.41GB, used=2.10GB System: total=4.00MB, used=4.00KB Metadata: total=264.00MB, used=2.79MB # df -t btrfs FilesystemType 1K-blocks Used Available Use% Mounted on /dev/sda1btrfs13189632 2202740 10714232 18% /mnt/btr # gdisk -l /dev/sda 12048 8467166 4.0 GiB 0700 Linux/Windows data /dev/sdc 1204817916206 8.5 GiB 0700 Linux/Windows data # # 7 GByte kopiert # btrfs filesystem show Label: 'SCSI' uuid: 4d834705-5a65-4d1f-a8a0-a5ea9348db50 Total devices 2 FS bytes used 9.03GB devid1 size 4.04GB used 1.42GB path /dev/sda1 devid2 size 8.54GB used 8.25GB path /dev/sdc1 Btrfs Btrfs v0.19 # btrfs filesystem df /mnt/btr Data: total=9.41GB, used=9.02GB System: total=4.00MB, used=4.00KB Metadata: total=264.00MB, used=11.80MB # df -t btrfs FilesystemType 1K-blocks Used Available Use% Mounted on /dev/sda1btrfs13189632 9467164 3459032 74% /mnt/btr # gdisk -l /dev/sda 12048 8467166 4.0 GiB 0700 Linux/Windows data /dev/sdc 1204817916206 8.5 GiB 0700 Linux/Windows data # # btrfs device add /dev/sdb1 /mnt/btr # btrfs filesystem show Label: 'SCSI' uuid: 4d834705-5a65-4d1f-a8a0-a5ea9348db50 Total devices 3 FS bytes used 9.12GB devid3 size 136.73GB used 0.00 path /dev/sdb1 devid1 size 4.04GB used 1.42GB path /dev/sda1 devid2 size 8.54GB used 8.25GB path /dev/sdc1 Btrfs Btrfs v0.19 # btrfs filesystem df /mnt/btr Data: total=9.41GB, used=9.10GB System: total=4.00MB, used=4.00KB Metadata: total=264.00MB, used=11.91MB # df -t btrfs FilesystemType 1K-blocks Used Available Use% Mounted on /dev/sda1btrfs 156562592 9559000 146739216 7% /mnt/btr # gdisk -l /dev/sda 12048 8467166 4.0 GiB 0700 Linux/Windows data /dev/sdc 1204817916206 8.5 GiB 0700 Linux/Windows data /dev/sdb 12048 286747966 136.7 GiB 0700 Linux/Windows data # # btrfs filesystem balance /mnt/btr # btrfs filesystem show Label: 'SCSI' uuid: 4d834705-5a65-4d1f-a8a0-a5ea9348db50 Total devices 3 FS bytes used 9.11GB devid3 size 136.73GB used 10.61GB path /dev/sdb1 devid1 size 4
Re: How to remove a device on a RAID-1 before replacing it?
On Tue, Mar 29, 2011 at 2:09 PM, Andrew Lutomirski wrote: > I have a disk with a SMART failure. It still works but I assume it'll > fail sooner or later. > > I want to remove it from my btrfs volume, replace it, and add the new > one. But the obvious command doesn't work: > > # btrfs device delete /dev/dm-5 /mnt/foo > ERROR: error removing the device '/dev/dm-5' > > dmesg says: > btrfs: unable to go below two devices on raid1 > > With mdadm, I would fail the device, remove it, run degraded until I > get a new device, and hot-add that device. > > With btrfs, I'd like some confirmation from the fs that data is > balanced appropriately so I won't get data loss if I just yank the > drive. And I don't even know how to tell btrfs to release the drive > so I can safely remove it. > > (Mounting with -o degraded doesn't help. I could umount, remove the > disk, then remount, but that feels like a hack.) There's no "nice" way to remove a failing disk in btrfs right now ("btrfs dev delete" is more of a online management thing to politely remove a perfectly functional disk you'd like to use for something else.) As I understand things, the only way to do it right now is the umount, remove disk, remount w/ degraded, and then btrfs add the new device. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
How to remove a device on a RAID-1 before replacing it?
I have a disk with a SMART failure. It still works but I assume it'll fail sooner or later. I want to remove it from my btrfs volume, replace it, and add the new one. But the obvious command doesn't work: # btrfs device delete /dev/dm-5 /mnt/foo ERROR: error removing the device '/dev/dm-5' dmesg says: btrfs: unable to go below two devices on raid1 With mdadm, I would fail the device, remove it, run degraded until I get a new device, and hot-add that device. With btrfs, I'd like some confirmation from the fs that data is balanced appropriately so I won't get data loss if I just yank the drive. And I don't even know how to tell btrfs to release the drive so I can safely remove it. (Mounting with -o degraded doesn't help. I could umount, remove the disk, then remount, but that feels like a hack.) This is 2.6.38.1 running Fedora 14's version of btrfs-progs, but btrfs-progs-unstable git does the same thing, as does btrfs-vol -r. --Andy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] mutex: Apply adaptive spinning on mutex_trylock()
On Tue, 2011-03-29 at 19:09 +0200, Tejun Heo wrote: > Here's the combined patch I was planning on testing but didn't get to > (yet). It implements two things - hard limit on spin duration and > early break if the owner also is spinning on a mutex. This is going to give massive conflicts with https://lkml.org/lkml/2011/3/2/286 https://lkml.org/lkml/2011/3/2/282 which I was planning to stuff into .40 > @@ -4021,16 +4025,44 @@ EXPORT_SYMBOL(schedule); > > #ifdef CONFIG_MUTEX_SPIN_ON_OWNER > /* > + * Maximum mutex owner spin duration in nsecs. Don't spin more then > + * DEF_TIMESLICE. > + */ > +#define MAX_MUTEX_SPIN_NS(DEF_TIMESLICE * 10LLU / HZ) DEF_TIMESLICE is SCHED_RR only, so its use here is dubious at best, also I bet we have something like NSEC_PER_SEC to avoid counting '0's. > + > +/** > + * mutex_spin_on_owner - optimistic adaptive spinning on locked mutex > + * @lock: the mutex to spin on > + * @owner: the current owner (speculative pointer) > + * > + * The caller is trying to acquire @lock held by @owner. If @owner is > + * currently running, it might get unlocked soon and spinning on it can > + * save the overhead of sleeping and waking up. > + * > + * Note that @owner is completely speculative and may be completely > + * invalid. It should be accessed very carefully. > + * > + * Forward progress is guaranteed regardless of locking ordering by never > + * spinning longer than MAX_MUTEX_SPIN_NS. This is necessary because > + * mutex_trylock(), which doesn't have to follow the usual locking > + * ordering, also uses this function. While that puts a limit on things it'll still waste time. I'd much rather pass an trylock argument to mutex_spin_on_owner() and then bail on owner also spinning. > + * CONTEXT: > + * Preemption disabled. > + * > + * RETURNS: > + * %true if the lock was released and the caller should retry locking. > + * %false if the caller better go sleeping. > */ > -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) > +bool mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) > { > @@ -4070,21 +4104,30 @@ int mutex_spin_on_owner(struct mutex *lo >* we likely have heavy contention. Return 0 to quit >* optimistic spinning and not contend further: >*/ > + ret = !lock->owner; > break; > } > > /* > - * Is that owner really running on that cpu? > + * Quit spinning if any of the followings is true. > + * > + * - The owner isn't running on that cpu. > + * - The owner also is spinning on a mutex. > + * - Someone else wants to use this cpu. > + * - We've been spinning for too long. >*/ > + if (task_thread_info(rq->curr) != owner || > + rq->spinning_on_mutex || need_resched() || > + local_clock() > start + MAX_MUTEX_SPIN_NS) { While we did our best with making local_clock() cheap, I'm still fairly uncomfortable with putting it in such a tight loop. > + ret = false; > + break; > + } > > arch_mutex_cpu_relax(); > } > > + this_rq()->spinning_on_mutex = false; > + return ret; > } > #endif > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] mutex: Apply adaptive spinning on mutex_trylock()
Here's the combined patch I was planning on testing but didn't get to (yet). It implements two things - hard limit on spin duration and early break if the owner also is spinning on a mutex. Thanks. Index: work1/include/linux/sched.h === --- work1.orig/include/linux/sched.h +++ work1/include/linux/sched.h @@ -359,7 +359,7 @@ extern signed long schedule_timeout_inte extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern bool mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; Index: work1/kernel/sched.c === --- work1.orig/kernel/sched.c +++ work1/kernel/sched.c @@ -536,6 +536,10 @@ struct rq { struct hrtimer hrtick_timer; #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + bool spinning_on_mutex; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; @@ -4021,16 +4025,44 @@ EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. + * Maximum mutex owner spin duration in nsecs. Don't spin more then + * DEF_TIMESLICE. + */ +#define MAX_MUTEX_SPIN_NS (DEF_TIMESLICE * 10LLU / HZ) + +/** + * mutex_spin_on_owner - optimistic adaptive spinning on locked mutex + * @lock: the mutex to spin on + * @owner: the current owner (speculative pointer) + * + * The caller is trying to acquire @lock held by @owner. If @owner is + * currently running, it might get unlocked soon and spinning on it can + * save the overhead of sleeping and waking up. + * + * Note that @owner is completely speculative and may be completely + * invalid. It should be accessed very carefully. + * + * Forward progress is guaranteed regardless of locking ordering by never + * spinning longer than MAX_MUTEX_SPIN_NS. This is necessary because + * mutex_trylock(), which doesn't have to follow the usual locking + * ordering, also uses this function. + * + * CONTEXT: + * Preemption disabled. + * + * RETURNS: + * %true if the lock was released and the caller should retry locking. + * %false if the caller better go sleeping. */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) { + unsigned long start; unsigned int cpu; struct rq *rq; + bool ret = true; if (!sched_feat(OWNER_SPIN)) - return 0; + return false; #ifdef CONFIG_DEBUG_PAGEALLOC /* @@ -4039,7 +4071,7 @@ int mutex_spin_on_owner(struct mutex *lo * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - return 0; + return false; #else cpu = owner->cpu; #endif @@ -4049,15 +4081,17 @@ int mutex_spin_on_owner(struct mutex *lo * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - return 0; + return false; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - return 0; + return false; + this_rq()->spinning_on_mutex = true; + start = local_clock(); rq = cpu_rq(cpu); for (;;) { @@ -4070,21 +4104,30 @@ int mutex_spin_on_owner(struct mutex *lo * we likely have heavy contention. Return 0 to quit * optimistic spinning and not contend further: */ - if (lock->owner) - return 0; + ret = !lock->owner; break; } /* -* Is that owner really running on that cpu? +* Quit spinning if any of the followings is true. +* +* - The owner isn't running on that cpu. +* - The owner also is spinning on a mutex. +* - Someone else wants to use this cpu. +* - We've been spinning for too long. */ - if (task_thread_info(rq->curr) != owner || need_resched()) - return 0; + if (task_thread_info(rq->curr) != owner || + rq->spinning_on_mutex || need_resched() || + local_clock() > start + MAX_MUTEX_SPIN_NS) { + ret = false; + break; + } arch_mutex_cpu_relax(); } - return 1; +
Re: [PATCH 2/2] mutex: Apply adaptive spinning on mutex_trylock()
Hello, guys. I've been running dbench 50 for a few days now and the result is, well, I don't know how to call it. The problem was that the original patch didn't do anything because x86 fastpath code didn't call into the generic slowpath at all. static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { if (likely(atomic_cmpxchg(count, 1, 0) == 1)) return 1; else return 0; } So, I thought that I probably was doing unconscious data selection while I was running the test before sending out the patches. Maybe I was seeing what I wanted to see, so I ran tests in larger scale more methodologically. I first started with ten consecutive runs and then doubled it with intervening reboot and then basically ended up doing that twice for four configuration (I didn't do two runs of simple and refactor but just averaged the two). The hardware is mostly the same except that I switched to a hard drive instead of SSD as hard drives tend to be slower but more consistent in performance numbers. On each run, the filesystem is recreated and the system was rebooted after every ten runs. The numbers are the reported throughput in MiB/s at the end of each run. https://spreadsheets.google.com/ccc?key=0AsbaQh2SFt66dDdxOGZWVVlIbEdIOWRQLURVVUNYSXc&hl=en Here are the descriptions of the eight columns. simpleonly with patch to make btrfs use mutex refactor mutex_spin() factored out spin mutex_spin() applied to the unused trylock slowpath spin-1ditto spin-fixedx86 trylock fastpath updated to use generic slowpath spin-fixed-1 ditto code-layout refactor + dummy function added to mutex.c code-layout-1 ditto After running the simple, refactor and spin ones, I was convinced that there definitely was something which was causing the difference. The averages were apart by more than 1.5 sigma, but I couldn't explain what could have caused such difference. The code-layout runs were my desparate attempts to find explanation on what's going on. Addition of mutex_spin to the unused trylock generic path makes gcc arrange functions differently. Without it, trylock functions end up inbetween lock and unlock funcitons; with it, they are located at the end. I commented out the unused trylock slowpath function and added a dummy function at the end to make gcc generate similar assembly layout. At this point, the only conclusions I can draw are, * Using adaptive spinning on mutex_trylock() doesn't seem to buy anything according to btrfs dbench 50 runs. and much more importantly, * btrfs dbench 50 runs are probably not good for measuring subtle mutex performance differences. Maybe it's too macro and there are larger scale tendencies which skew the result unless the number of runs are vastly increased (but 40 runs are already over eight hours). If anyone can provide an explanation on what's going on, I'll be super happy. Otherwise, for now, I'll just leave it alone. :-( Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: New btrfsck status
Chris Mason skrev 10.2.2011 14:17: Excerpts from Ben Gamari's message of 2011-02-09 21:52:20 -0500: Hey all, Over the last several months there have been many claims regarding the release of the rewritten btrfsck. Unfortunately, despite numerous claims that it will be released Real Soon Now(c), I have yet to see even a repository with preliminary code. Did I miss an announcement? There is something to be said for "release early, release often." Is there a timeline for getting btrfsck into some sort of usable form? Yes, but its still real soon now. I've been at about 90% done since Christmas. It would have been out last week but I've been chasing a debugging a very difficult corruption under load. I finally found a race in btrfs causing the corruption and now I'm back on fsck full time again. Any status updates on this ? Checking: http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git;a=shortlog;h=refs/heads/next I see last commit is 3+ months ago -- Thomas -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [GIT PULL] Btrfs updates for 2.6.39
Excerpts from David Sterba's message of 2011-03-28 09:25:28 -0400: > Hi, > > Chris, you did not add my sign-of on the unaligned fix I've posted, nor > the tested-by or reported-by. Looking again into the mail, it seems like > some automated-misprocessing picked the first part of the mail altough > there was properly formated patch after '--' marker (and I've verified > this is correct before sending the mail). > > Can you please fix that? > Patch reference: https://patchwork.kernel.org/patch/645671/ Sorry, looks like I messed that one up. patchwork did bring in the comments in a funny way and I didn't notice before the pull request went out. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: add initial tracepoint support for btrfs
Please ignore this patch... I just found we'd better revise the tracepoint side instead of btrfs side, will dig it more. thanks, liubo > From: Liu Bo > > [PATCH] Btrfs: fix compile warnings of btrfs tracepoint on 32bit box > > include/trace/events/btrfs.h:47:1: warning: large integer implicitly > truncated to unsigned type > include/trace/events/btrfs.h:47:1: warning: large integer implicitly > truncated to unsigned type > include/trace/events/btrfs.h:47:1: warning: large integer implicitly > truncated to unsigned type > > btrfs has defined some macros which value has ULL type, and when btrfs > tracepoints > use these macros on 32bit box, values like "-1ULL" will be truncated. > This is where those warnings come from. > > Signed-off-by: Liu Bo > --- > include/trace/events/btrfs.h | 19 +++ > 1 files changed, 11 insertions(+), 8 deletions(-) > > diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h > index f445cff..27e67fd 100644 > --- a/include/trace/events/btrfs.h > +++ b/include/trace/events/btrfs.h > @@ -36,9 +36,12 @@ struct extent_buffer; > { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ > { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ > { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ > - { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ > - { BTRFS_TREE_RELOC_OBJECTID,"TREE_RELOC"}, \ > - { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) > + { (unsigned long)BTRFS_TREE_LOG_OBJECTID, \ > + "TREE_LOG" }, \ > + { (unsigned long)BTRFS_TREE_RELOC_OBJECTID, \ > + "TREE_RELOC"}, \ > + { (unsigned long)BTRFS_DATA_RELOC_TREE_OBJECTID,\ > + "DATA_RELOC_TREE" }) > > #define show_root_type(obj) \ > obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||\ > @@ -126,13 +129,13 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, > > #define __show_map_type(type) > \ > __print_symbolic(type, \ > - { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ > - { EXTENT_MAP_HOLE, "HOLE" }, \ > - { EXTENT_MAP_INLINE,"INLINE"}, \ > - { EXTENT_MAP_DELALLOC, "DELALLOC" }) > + { (unsigned long)EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ > + { (unsigned long)EXTENT_MAP_HOLE, "HOLE" }, \ > + { (unsigned long)EXTENT_MAP_INLINE, "INLINE"}, \ > + { (unsigned long)EXTENT_MAP_DELALLOC, "DELALLOC" }) > > #define show_map_type(type) \ > - type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) > + type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) > > #define show_map_flags(flag) \ > __print_flags(flag, "|",\ -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: add initial tracepoint support for btrfs
On 03/29/2011 09:16 AM, liubo wrote: > On 03/28/2011 08:59 AM, Chris Mason wrote: >> Excerpts from Chris Mason's message of 2011-03-26 08:12:04 -0400: >>> Excerpts from liubo's message of 2011-03-24 07:18:59 -0400: Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g >>> This is really neat, I've queued it up. >> Whoops, it has a lot of warnings when compiled on 32 bit machines. >> Please take a look: >> >> include/trace/events/btrfs.h:47:1: warning: large integer implicitly >> truncated to unsigned type >> include/trace/events/btrfs.h:47:1: warning: large integer implicitly >> truncated to unsigned type >> include/trace/events/btrfs.h:47:1: warning: large integer implicitly >> truncated to unsigned type >> include/trace/events/btrfs.h:68:1: warning: large integer implicitly >> truncated to unsigned type >> include/trace/events/btrfs.h:68:1: warning: large integer implicitly >> truncated to unsigned type >> include/trace/events/btrfs.h:68:1: warning: large integer implicitly >> truncated to unsigned type >> include/trace/events/btrfs.h:144:1: warning: large integer implicitly >> truncated to unsigned type >> > > Ahh, I figure it out. > Will send a new version to clear warnings. > Here is the patch to clear warnings. From: Liu Bo [PATCH] Btrfs: fix compile warnings of btrfs tracepoint on 32bit box include/trace/events/btrfs.h:47:1: warning: large integer implicitly truncated to unsigned type include/trace/events/btrfs.h:47:1: warning: large integer implicitly truncated to unsigned type include/trace/events/btrfs.h:47:1: warning: large integer implicitly truncated to unsigned type btrfs has defined some macros which value has ULL type, and when btrfs tracepoints use these macros on 32bit box, values like "-1ULL" will be truncated. This is where those warnings come from. Signed-off-by: Liu Bo --- include/trace/events/btrfs.h | 19 +++ 1 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f445cff..27e67fd 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -36,9 +36,12 @@ struct extent_buffer; { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ - { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ - { BTRFS_TREE_RELOC_OBJECTID,"TREE_RELOC"}, \ - { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) + { (unsigned long)BTRFS_TREE_LOG_OBJECTID, \ + "TREE_LOG" }, \ + { (unsigned long)BTRFS_TREE_RELOC_OBJECTID, \ + "TREE_RELOC"}, \ + { (unsigned long)BTRFS_DATA_RELOC_TREE_OBJECTID,\ + "DATA_RELOC_TREE" }) #define show_root_type(obj)\ obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||\ @@ -126,13 +129,13 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, #define __show_map_type(type) \ __print_symbolic(type, \ - { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ - { EXTENT_MAP_HOLE, "HOLE" }, \ - { EXTENT_MAP_INLINE,"INLINE"}, \ - { EXTENT_MAP_DELALLOC, "DELALLOC" }) + { (unsigned long)EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ + { (unsigned long)EXTENT_MAP_HOLE, "HOLE" }, \ + { (unsigned long)EXTENT_MAP_INLINE, "INLINE"}, \ + { (unsigned long)EXTENT_MAP_DELALLOC, "DELALLOC" }) #define show_map_type(type)\ - type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) + type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) #define show_map_flags(flag) \ __print_flags(flag, "|",\ -- 1.6.5.2 > Thanks, > liubo > >> -chris >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html