[PATCH v2] Btrfs: fix fsync xattr loss in the fast fsync path

2015-06-18 Thread fdmanana
From: Filipe Manana fdman...@suse.com

After commit 4f764e515361 (Btrfs: remove deleted xattrs on fsync log
replay), we can end up in a situation where during log replay we end up
deleting xattrs that were never deleted when their file was last fsynced.

This happens in the fast fsync path (flag BTRFS_INODE_NEEDS_FULL_SYNC is
not set in the inode) if the inode has the flag BTRFS_INODE_COPY_EVERYTHING
set, the xattr was added in a past transaction and the leaf where the
xattr is located was not updated (COWed or created) in the current
transaction. In this scenario the xattr item never ends up in the log
tree and therefore at log replay time, which makes the replay code delete
the xattr from the fs/subvol tree as it thinks that xattr was deleted
prior to the last fsync.

Fix this by using a new item key type that represents xattrs to be deleted
at log replay time. This key type is only used in the log tree. By using
this explicit item we can continue to log only xattrs that were added (or
modified) in the current transaction instead of all xattrs, while still
keeping the the intention of commit 4f764e515361 (Btrfs: remove deleted
xattrs on fsync log replay).

This issue is reproducible with the following test case for fstests:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo QA output created by $seq

  here=`pwd`
  tmp=/tmp/$$
  status=1  # failure is the default!

  _cleanup()
  {
  _cleanup_flakey
  rm -f $tmp.*
  }
  trap _cleanup; exit \$status 0 1 2 3 15

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey
  . ./common/attr

  # real QA test starts here

  # We create a lot of xattrs for a single file. Only btrfs and xfs are 
currently
  # able to store such a large mount of xattrs per file, other filesystems such
  # as ext3/4 and f2fs for example, fail with ENOSPC even if we attempt to add
  # less than 1000 xattrs with very small values.
  _supported_fs btrfs xfs
  _supported_os Linux
  _need_to_be_root
  _require_scratch
  _require_dm_flakey
  _require_attrs
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs  $seqres.full 21
  _init_flakey
  _mount_flakey

  # Create the test file with some initial data and make sure everything is
  # durably persisted.
  $XFS_IO_PROG -f -c pwrite -S 0xaa 0 32k $SCRATCH_MNT/foo | _filter_xfs_io
  sync

  # Add many small xattrs to our file.
  # We create such a large amount because it's needed to trigger the issue found
  # in btrfs - we need to have an amount that causes the fs to have at least 3
  # btree leafs with xattrs stored in them, and it must work on any leaf size
  # (maximum leaf/node size is 64Kb).
  num_xattrs=2000
  for ((i = 1; i = $num_xattrs; i++)); do
  $SETFATTR_PROG -n user.attr_$(printf %04d $i) \
  -v somevalue_$(printf %04d $i) $SCRATCH_MNT/foo
  done

  # Sync the filesystem to force a commit of the current btrfs transaction, this
  # is a necessary condition to trigger the bug on btrfs.
  sync

  # Now update our file's data and fsync the file.
  # After a successful fsync, if the fsync log/journal is replayed we expect to
  # see all the xattrs we added before with the same values (and the updated 
file
  # data of course). Btrfs used to delete some of these xattrs when it replayed
  # its fsync log/journal.
  $XFS_IO_PROG -c pwrite -S 0xbb 8K 16K \
   -c fsync \
   $SCRATCH_MNT/foo | _filter_xfs_io

  echo File content after fsync and before crash:
  od -t x1 $SCRATCH_MNT/foo

  echo File xattrs after fsync and before crash:
  for ((i = 1; i = $num_xattrs; i++)); do
  $GETFATTR_PROG --absolute-names -n user.attr_$(printf %04d $i) \
  $SCRATCH_MNT/foo | _filter_scratch
  done

  # Simulate a crash/power loss.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again and mount. This makes the fs replay its fsync log.
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  echo File content after crash and log replay:
  od -t x1 $SCRATCH_MNT/foo

  echo File xattrs after crash and log replay:
  for ((i = 1; i = $num_xattrs; i++)); do
  $GETFATTR_PROG --absolute-names -n user.attr_$(printf %04d $i) \
  $SCRATCH_MNT/foo | _filter_scratch
  done

  status=0
  exit

The golden output expects all xattrs to be available, and with the correct
values, after the fsync log is replayed.

Signed-off-by: Filipe Manana fdman...@suse.com
---

V2: Updated reproducer test, no code changes.
The first version of the test actually only failed if my other
unrelated patch titled Btrfs: fix fsync data loss after append write
was applied. That's because it forced logging the inode item which
is necessary to trigger the xattr deletion replay code. So updated
the test such that it fails without that patch applied too.

 fs/btrfs/ctree.h|  15 ++
 fs/btrfs/dir-item.c |  71 ++-
 fs/btrfs/inode.c|   5 +-
 

Re: btrfs partition converted from ext4 becomes read-only minutes after booting: WARNING: CPU: 2 PID: 2777 at ../fs/btrfs/super.c:260 __btrfs_abort_transaction+0x4b/0x120

2015-06-18 Thread Robert Munteanu
On Wed, Jun 17, 2015 at 9:48 PM, Jeff Mahoney je...@suse.com wrote:
 -BEGIN PGP SIGNED MESSAGE-
 Hash: SHA1

 On 6/12/15 8:19 AM, Robert Munteanu wrote:
 Hi,

 I have converted my root ext4 partition to btrfs. I used an USB
 stick to boot and used btrfs-convert.

 I also did a balance and defrag ( in that order ) , both when the
 fs was mounted.

 After logging in to KDE I quickly get a read-only filesystem. I've
 pasted the backtrace below

 Jun 11 23:13:08 mars kernel: WARNING: CPU: 2 PID: 2777 at
 ../fs/btrfs/super.c:260 __btrfs_abort_transaction+0x4b/0x120
 [btrfs]() Jun 11 23:13:08 mars kernel: BTRFS: Transaction aborted
 (error -95) Jun 11 23:13:08 mars kernel: Modules linked in: bnep
 bluetooth rfkill fuse vboxpci(O) vboxnetadp(O) vboxnetflt(O)
 vboxdrv(O) af_packet nf_log_ipv6 xt_pkttype nf_log_ip v4
 nf_log_common xt_LOG xt_limit ip6t_REJECT xt_tcpudp
 nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_raw ipt_REJECT
 iptable_raw xt_CT iptable_filter ip6table_mangle nf_con
 ntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ipv4
 nf_defrag_ipv4 ip_tables xt_conntrack nf_conntrack ip6table_filter
 ip6_tables x_tables xfs libcrc32c snd_hda _codec_hdmi raid1 md_mod
 gpio_ich ppdev iTCO_wdt iTCO_vendor_support coretemp
 snd_hda_codec_realtek snd_hda_codec_generic kvm_intel snd_hda_intel
 dm_mod kvm snd_hda_co ntroller snd_hda_codec snd_hwdep serio_raw
 pcspkr snd_pcm i2c_i801 snd_seq joydev snd_seq_device snd_timer snd
 8250_fintek parport_pc parport acpi_cpufreq lpc_ich Jun 11 23:13:08
 mars kernel:  soundcore mfd_core shpchp processor ata_generic btrfs
 hid_logitech_hidpp xor raid6_pq sr_mod cdrom nvidia_uvm(PO)
 nvidia(PO) firewire_ohc i firewire_core crc_itu_t uas usb_storage
 r8169 mii pata_jmicron hid_logitech_dj drm button sg Jun 11
 23:13:08 mars kernel: CPU: 2 PID: 2777 Comm: kworker/u8:0 Tainted:
 P   O4.0.4-3-desktop #1

 openSUSE Tumbleweed, I take it?

Yes. Also reported at https://bugzilla.opensuse.org/show_bug.cgi?id=934464 .


 We still actively support btrfs-convert through SLES, so we're
 invested in ensuring it continues working properly.  I'd be interested
 in seeing images of both filesystems to investigate and to see if we
 can reproduce it. Errno -95 is -EOPNOTSUPP which is kind of strange to
 see.  I can see a few possible places it would get passed up with a
 trace like that but being able to reproduce it would be extremely helpfu
 l.

In the meantime I got bored and 'fixed' it ( I think ) . The fix
involved copying the /home tree to a different ( xfs ) partition, and
now the error does not appear anymore.

However, I still have the old tree in the btrfs partition ( as /home2
). I will try and reproduce the issue there and submit a reduced image
which triggers the error. Unfortunately I no longer have the old ext4
partition image.

Thanks,

Robert


 - -Jeff

 Jun 11 23:13:08 mars kernel: Hardware name: Gigabyte Technology
 Co., Ltd. EP35-DS4/EP35-DS4, BIOS F6d 01/08/2009 Jun 11 23:13:08
 mars kernel: Workqueue: btrfs-endio-write btrfs_endio_write_helper
 [btrfs] Jun 11 23:13:08 mars kernel:  
 a0a92832 8167c4aa 880128513ca8 Jun 11 23:13:08
 mars kernel:  81063bb1 880031929d28 880221e71800
 ffa1 Jun 11 23:13:08 mars kernel:  a0a914e0
 0b50 81063c2a a0a95928 Jun 11 23:13:08
 mars kernel: Call Trace: Jun 11 23:13:08 mars kernel:
 [8100574c] dump_trace+0x8c/0x340 Jun 11 23:13:08 mars
 kernel:  [81005aa3] show_stack_log_lvl+0xa3/0x190 Jun 11
 23:13:08 mars kernel:  [81007201] show_stack+0x21/0x50
 Jun 11 23:13:08 mars kernel:  [8167c4aa]
 dump_stack+0x47/0x67 Jun 11 23:13:08 mars kernel:
 [81063bb1] warn_slowpath_common+0x81/0xb0 Jun 11 23:13:08
 mars kernel:  [81063c2a] warn_slowpath_fmt+0x4a/0x50 Jun
 11 23:13:08 mars kernel:  [a09e598b]
 __btrfs_abort_transaction+0x4b/0x120 [btrfs] Jun 11 23:13:08 mars
 kernel:  [a0a1d18a] btrfs_finish_ordered_io+0x5aa/0x620
 [btrfs] Jun 11 23:13:08 mars kernel:  [a0a43253]
 normal_work_helper+0xc3/0x320 [btrfs] Jun 11 23:13:08 mars kernel:
 [8107bcf2] process_one_work+0x142/0x420 Jun 11 23:13:08
 mars kernel:  [8107c0e4] worker_thread+0x114/0x460 Jun 11
 23:13:08 mars kernel:  [81081261] kthread+0xc1/0xe0 Jun
 11 23:13:08 mars kernel:  [81682d58]
 ret_from_fork+0x58/0x90 Jun 11 23:13:08 mars kernel: ---[ end trace
 4c4eb7d6e98afa91 ]--- Jun 11 23:13:08 mars kernel: BTRFS: error
 (device sda1) in btrfs_finish_ordered_io:2896: errno=-95 unknown
 Jun 11 23:13:08 mars kernel: BTRFS info (device sda1): forced
 readonly

 Some diagnostic info:

 - btrfs scrub reports no errors - on the host machine I'm running
 btrfs v4.0+20150429 and kernel 4.0.4-3-desktop - on the live
 medium, used to run btrfs-convert, I was running btrfs
 v4.0+20150429 and kernel 4.0.3-1-default

 # btrfs fi show Label: none  uuid:
 

Re: btrfs partition converted from ext4 becomes read-only minutes after booting: WARNING: CPU: 2 PID: 2777 at ../fs/btrfs/super.c:260 __btrfs_abort_transaction+0x4b/0x120

2015-06-18 Thread Robert Munteanu
On Wed, Jun 17, 2015 at 8:46 PM, Marc MERLIN m...@merlins.org wrote:
 On Fri, Jun 12, 2015 at 03:19:06PM +0300, Robert Munteanu wrote:
 Hi,

 Note to others: kernel 4.0.4

 Reply to you:
 I tried ext4 to btrfs once a year ago and it severely mangled my
 filesystem.
 I looked at it as a cool feature/hack that may have worked some time ago, but
 that no one really uses anymore, and that may not work right at this
 point.

 Unless you hear back from a developer interested in debugging/fixing
 this, I would assume that this feature is broken and dead.

I did hear, but in case the general consensus is that this feature is
broken/experimental/unsafe, it would be great to mention it in the
wiki page.

Thanks,

Robert


 Marc

 I have converted my root ext4 partition to btrfs. I used an USB stick
 to boot and used btrfs-convert.

 I also did a balance and defrag ( in that order ) , both when the fs
 was mounted.

 After logging in to KDE I quickly get a read-only filesystem. I've
 pasted the backtrace below

 Jun 11 23:13:08 mars kernel: WARNING: CPU: 2 PID: 2777 at
 ../fs/btrfs/super.c:260 __btrfs_abort_transaction+0x4b/0x120 [btrfs]()
 Jun 11 23:13:08 mars kernel: BTRFS: Transaction aborted (error -95)
 Jun 11 23:13:08 mars kernel: Modules linked in: bnep bluetooth rfkill
 fuse vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) af_packet
 nf_log_ipv6 xt_pkttype nf_log_ip
 v4 nf_log_common xt_LOG xt_limit ip6t_REJECT xt_tcpudp
 nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_raw ipt_REJECT iptable_raw
 xt_CT iptable_filter ip6table_mangle nf_con
 ntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ipv4
 nf_defrag_ipv4 ip_tables xt_conntrack nf_conntrack ip6table_filter
 ip6_tables x_tables xfs libcrc32c snd_hda
 _codec_hdmi raid1 md_mod gpio_ich ppdev iTCO_wdt iTCO_vendor_support
 coretemp snd_hda_codec_realtek snd_hda_codec_generic kvm_intel
 snd_hda_intel dm_mod kvm snd_hda_co
 ntroller snd_hda_codec snd_hwdep serio_raw pcspkr snd_pcm i2c_i801
 snd_seq joydev snd_seq_device snd_timer snd 8250_fintek parport_pc
 parport acpi_cpufreq lpc_ich
 Jun 11 23:13:08 mars kernel:  soundcore mfd_core shpchp processor
 ata_generic btrfs hid_logitech_hidpp xor raid6_pq sr_mod cdrom
 nvidia_uvm(PO) nvidia(PO) firewire_ohc
 i firewire_core crc_itu_t uas usb_storage r8169 mii pata_jmicron
 hid_logitech_dj drm button sg
 Jun 11 23:13:08 mars kernel: CPU: 2 PID: 2777 Comm: kworker/u8:0
 Tainted: P   O4.0.4-3-desktop #1
 Jun 11 23:13:08 mars kernel: Hardware name: Gigabyte Technology Co.,
 Ltd. EP35-DS4/EP35-DS4, BIOS F6d 01/08/2009
 Jun 11 23:13:08 mars kernel: Workqueue: btrfs-endio-write
 btrfs_endio_write_helper [btrfs]
 Jun 11 23:13:08 mars kernel:   a0a92832
 8167c4aa 880128513ca8
 Jun 11 23:13:08 mars kernel:  81063bb1 880031929d28
 880221e71800 ffa1
 Jun 11 23:13:08 mars kernel:  a0a914e0 0b50
 81063c2a a0a95928
 Jun 11 23:13:08 mars kernel: Call Trace:
 Jun 11 23:13:08 mars kernel:  [8100574c] dump_trace+0x8c/0x340
 Jun 11 23:13:08 mars kernel:  [81005aa3] 
 show_stack_log_lvl+0xa3/0x190
 Jun 11 23:13:08 mars kernel:  [81007201] show_stack+0x21/0x50
 Jun 11 23:13:08 mars kernel:  [8167c4aa] dump_stack+0x47/0x67
 Jun 11 23:13:08 mars kernel:  [81063bb1]
 warn_slowpath_common+0x81/0xb0
 Jun 11 23:13:08 mars kernel:  [81063c2a] 
 warn_slowpath_fmt+0x4a/0x50
 Jun 11 23:13:08 mars kernel:  [a09e598b]
 __btrfs_abort_transaction+0x4b/0x120 [btrfs]
 Jun 11 23:13:08 mars kernel:  [a0a1d18a]
 btrfs_finish_ordered_io+0x5aa/0x620 [btrfs]
 Jun 11 23:13:08 mars kernel:  [a0a43253]
 normal_work_helper+0xc3/0x320 [btrfs]
 Jun 11 23:13:08 mars kernel:  [8107bcf2] 
 process_one_work+0x142/0x420
 Jun 11 23:13:08 mars kernel:  [8107c0e4] worker_thread+0x114/0x460
 Jun 11 23:13:08 mars kernel:  [81081261] kthread+0xc1/0xe0
 Jun 11 23:13:08 mars kernel:  [81682d58] ret_from_fork+0x58/0x90
 Jun 11 23:13:08 mars kernel: ---[ end trace 4c4eb7d6e98afa91 ]---
 Jun 11 23:13:08 mars kernel: BTRFS: error (device sda1) in
 btrfs_finish_ordered_io:2896: errno=-95 unknown
 Jun 11 23:13:08 mars kernel: BTRFS info (device sda1): forced readonly

 Some diagnostic info:

 - btrfs scrub reports no errors
 - on the host machine I'm running btrfs v4.0+20150429 and kernel 
 4.0.4-3-desktop
 - on the live medium, used to run btrfs-convert, I was running btrfs
 v4.0+20150429 and kernel 4.0.3-1-default

 # btrfs fi show
 Label: none  uuid: 54dea125-74cd-4bb2-86a2-f7bc645b76cf
 Total devices 1 FS bytes used 90.22GiB
 devid1 size 223.57GiB used 92.03GiB path /dev/sda1

 btrfs-progs v4.0+20150429

 # btrfs fi df /
 Data, single: total=89.00GiB, used=88.17GiB
 System, single: total=32.00MiB, used=16.00KiB
 Metadata, single: total=3.00GiB, used=2.05GiB
 GlobalReserve, single: total=512.00MiB, used=0.00B

 Is there a way out? I still have the 

i_version vs iversion (Was: Re: [RFC PATCH v2 1/2] Btrfs: add noi_version option to disable MS_I_VERSION)

2015-06-18 Thread David Sterba
Moving the discussion to fsdevel.

Summary: disabling MS_I_VERSION brings some speedups to btrfs, but the
generic 'noiversion' option cannot be used to achieve that. It is
processed before it reaches btrfs superblock callback, where
MS_I_VERSION is forced.

The proposed fix is to add btrfs-specific i_version/noi_version to btrfs,
to which I object.

Continued below.

On Thu, Jun 18, 2015 at 10:46:09AM +0800, Liu Bo wrote:
 On Wed, Jun 17, 2015 at 07:01:18PM +0200, David Sterba wrote:
  On Wed, Jun 17, 2015 at 11:52:36PM +0800, Liu Bo wrote:
   On Wed, Jun 17, 2015 at 05:33:06PM +0200, David Sterba wrote:
On Wed, Jun 17, 2015 at 03:54:31PM +0800, Liu Bo wrote:
 MS_I_VERSION is enabled by default for btrfs, this adds an alternative
 option to toggle it off.

There's an existing generic iversion/noiversion mount option pair, no
need to extra add it to btrfs.
   
   I know, it doesn't work though.
  
  Sigh, I see, btrfs forces MS_I_VERSION flag,
  0c4d2d95d06e920e0c61707e62c7fffc9c57f63a. I read 'enabled by default' as
  that there's a standard way to override the defaults.
  
  So the right way is not to do that but this will break everyhing that
  relies on that behaviour at the moment. This means to add the exception
  to the upper layers, either VFS or 'mount', which is not very likely to
  happen.
  
  The generic options do not reach the filesystem specific callbacks, so
  we can't check it.
 
 Ext4 also makes its own i_version option, so I think we can do the
 same thing until more filesystems require to do it in a generic way.

AFAICS, ext4 had added it's own i_version before iversion was added to
mount:

ext4:

Commit:  25ec56b518257a56d2ff41a941d288e4b5ff9488
Commit date: Mon Jan 28 23:58:27 2008 -0500
Subject: ext4: Add inode version support in ext4

util-linux:

Commti:  4fa5e73d16828c94234ba0aeafaec2470f79011c
Commit date: Thu Nov 27 12:08:44 2008 +0100
Subject: mount: add i_version support

I don't know the history, this looks like adding the options was not
coordinated.

 The performance benefit with no_iversion is obvious for fsync
 related workloads since we would avoid some expensive log commits.

It is obviuos, but I'd like to avoid cluttering the mount options
interface further.

xfs also forces I_VERSION if it detects the superblock version 5, so it
could use the same fix that would work for btrfs.

I see two possibilities that pretend to be generic and clean:

1) the filesystem MS_I_* defaults would be exported and processed up the
   mount call stack

2) pass the full mount options to the filesystem (if requested eg. by
   file_system_type::fs_flags bits).

The other ideas contain 'make an exception to ... ' which does not sound
appealing.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] btrfs: csum: Introduce partial csum for tree block.

2015-06-18 Thread Facebook



On Wed, Jun 17, 2015 at 9:34 PM, Qu Wenruo quwen...@cn.fujitsu.com 
wrote:

Ping?

New new comments?


As our block sizes get bigger, it makes sense to think about more fine 
grained checksums.  We're using crcs for:


1) memory corruption on the way down to the storage.  We could be very 
small (bitflips) or smaller chunks (dma corrupting the whole bio).  The 
places I've seen this in production, the partial crcs might help save a 
percentage of the blocks, but overall the corruptions were just too 
pervasive to get back the data.


2) incomplete writes.  We're sending down up to 64K btree blocks, the 
storage might only write some of them.


3) IO errors from the drive.  These are likely to fail in much bigger 
chunks and the partial csums probably won't help at all.


I think the best way to repair all of these is with replication, either 
RAID5/6 or some number of mirrored copies.  It's more reliable than 
trying to stitch together streams from multiple copies, and the code 
complexity is much lower.


But, where I do find the partial crcs interesting is the ability to 
more accurately detect those three failure modes with our larger block 
sizes.  That's pure statistics based on the crc we've chosen and the 
size of the block.  The right answer might just be a different crc, but 
I'm more than open to data here.


-chris


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] check: check so offset is not bigger then the leaf

2015-06-18 Thread David Sterba
On Thu, Jun 18, 2015 at 01:59:13AM +0200, Robert Marklund wrote:
 This could crash before because of dangerous dangling
 offset of pointer.

That's right, this can happen. There are more btrfs_item_ptr that would
be good to validate that way, namely in the checker as it's most likely
to see corrupted data.

I think it's worth to add a wrapper macro for that, that would be like

(int) btrfs_item_ptr_validate(ei, leaf, slot, struct ..., *optional_key)

and return 0 if it's ok, 1 if there's a problem and prints the details.

 Signed-off-by: Robert Marklund robbelibob...@gmail.com
 ---
  cmds-check.c | 10 ++
  1 file changed, 10 insertions(+)
 
 diff --git a/cmds-check.c b/cmds-check.c
 index 778f141..da36758 100644
 --- a/cmds-check.c
 +++ b/cmds-check.c
 @@ -8906,6 +8906,16 @@ static int build_roots_info_cache(struct btrfs_fs_info 
 *info)
   goto next;
  
   ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 +
 + if ((long long)ei  info-extent_root-leafsize) {
 +  fprintf(stderr, Bad leaf = %p, slot = %d\n, leaf, 
 slot);
 +  fprintf(stderr, item ptr = %p\n, ei);
 +  fprintf(stderr, objectid = %llx\n, 
 found_key.objectid);
 +  fprintf(stderr, type = %x\n, found_key.type);
 +  fprintf(stderr, offset   = %llx\n, found_key.offset);

Hm, I'm not sure whether to continue or fail at this point.

Do you have a crafted filesystem image that can reproduce that or was
that found by code inspection?

 +  goto next;
 + }
 +
   flags = btrfs_extent_flags(leaf, ei);
  
   if (found_key.type == BTRFS_EXTENT_ITEM_KEY 
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] btrfs: csum: Introduce partial csum for tree block.

2015-06-18 Thread David Sterba
On Thu, Jun 18, 2015 at 11:57:46AM -0400, Facebook wrote:
  New new comments?
 
 As our block sizes get bigger, it makes sense to think about more fine 
 grained checksums.  We're using crcs for:
 
 1) memory corruption on the way down to the storage.  We could be very 
 small (bitflips) or smaller chunks (dma corrupting the whole bio).  The 
 places I've seen this in production, the partial crcs might help save a 
 percentage of the blocks, but overall the corruptions were just too 
 pervasive to get back the data.
 
 2) incomplete writes.  We're sending down up to 64K btree blocks, the 
 storage might only write some of them.
 
 3) IO errors from the drive.  These are likely to fail in much bigger 
 chunks and the partial csums probably won't help at all.
 
 I think the best way to repair all of these is with replication, either 
 RAID5/6 or some number of mirrored copies.  It's more reliable than 
 trying to stitch together streams from multiple copies, and the code 
 complexity is much lower.

I agree with that. I'm still not convinced that adding all the kernel
code to repair the data is justified, compared to the block-level
redundancy alternatives.

 But, where I do find the partial crcs interesting is the ability to 
 more accurately detect those three failure modes with our larger block 
 sizes.  That's pure statistics based on the crc we've chosen and the 
 size of the block.  The right answer might just be a different crc, but 
 I'm more than open to data here.

Good point, the detection aspect costs only the increased checksumming
and reporting. My assumption is that this will happen infrequently and
can point out serious hardware problems. In that case taking the
filesytem offline is a workable option and improving the userspace tools
to actually attempt the targeted block repair seems easier. Note that
this would come after redundant raid would not be able to fix it.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] check: check so offset is not bigger then the leaf

2015-06-18 Thread Josef Bacik

On 06/18/2015 09:44 AM, David Sterba wrote:

On Thu, Jun 18, 2015 at 01:59:13AM +0200, Robert Marklund wrote:

This could crash before because of dangerous dangling
offset of pointer.


That's right, this can happen. There are more btrfs_item_ptr that would
be good to validate that way, namely in the checker as it's most likely
to see corrupted data.



The check_block stuff should be doing this, if it isn't that's where we 
need to fix it.  Thanks,


Josef

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] btrfs-progs: Add nbytes output for print-tree and reformat inode output

2015-06-18 Thread Qu Wenruo
The original implement doesn't output nbytes in btrfs_inode.
Add the output and since the output is too long, reformat it to multi
lines.

This is very handy to debug related bugs.

Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
 print-tree.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/print-tree.c b/print-tree.c
index df63334..a72a979 100644
--- a/print-tree.c
+++ b/print-tree.c
@@ -841,10 +841,13 @@ void btrfs_print_leaf(struct btrfs_root *root, struct 
extent_buffer *l)
switch (type) {
case BTRFS_INODE_ITEM_KEY:
ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-   printf(\t\tinode generation %llu transid %llu size 
%llu block group %llu mode %o links %u uid %u gid %u rdev %llu flags 0x%llx\n,
+   printf(\t\tinode generation %llu transid %llu size 
%llu nbytes %llu\n
+  \t\tblock group %llu mode %o links %u uid %u 
gid %u\n
+  \t\trdev %llu flags 0x%llx\n,
   (unsigned long long)btrfs_inode_generation(l, 
ii),
   (unsigned long long)btrfs_inode_transid(l, ii),
   (unsigned long long)btrfs_inode_size(l, ii),
+  (unsigned long long)btrfs_inode_nbytes(l, ii),
   (unsigned long 
long)btrfs_inode_block_group(l,ii),
   btrfs_inode_mode(l, ii),
   btrfs_inode_nlink(l, ii),
-- 
2.4.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v7 1/9] vfs: Add sb_want_write() function to grant write access to sb without the struct file/vfsmount.

2015-06-18 Thread Qu Wenruo

Any new comments?

BTW, who is responsible to merge the vfs change patch?
Should btrfs maintainers to merge it or vfs maintainer?

Thanks,
Qu

Qu Wenruo wrote on 2015/02/06 13:45 +0800:

There are sysfs interfaces in some fs, only btrfs yet, which will modify
on-disk data.
Unlike normal file operation routine we can use mnt_want_write_file() to
protect the operation, change through sysfs won't to be binded to any file
in the filesystem.

So introduce new sb_want_write() to do the protection against a super
block, which acts much like mnt_want_write() but will return success if
the super block is read-write.

The implement is to use a atomic value as a simplified rw-semaphore,
which only provides non-block lock method.
We don't use the traditional rw-sem because in do_umount(), we need to
block incoming sb_want_write() until the sb is killed if this is the
last mount instance.
However kill_sb() can be delayed to other thread, so down_write() and
up_write() will happen in different thread, and this is not allowed.

This patch also slightly modified struct super_block and
do_umount/remount(), where we do extra check for blocking sb_want_write()
and don't allow the umount of the *last* mount instance of a super block
or remount it ro.

Cc: linux-fsdevel linux-fsde...@vger.kernel.org
Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Reviewed-by: David Sterba dste...@suse.cz
---
Changelog:
v4:
   Newly introduced.
v5:
   Change name to sb_want_write() and receive sb and parameter.
v6:
   Add better check when umounting the last instance of a super block. So
   sb_want_write() waiting for fs unfrozen/transaction will prevent
   umount.
v7:
   Use atomic instead of manually implemented rw-sem.
   Add check for remount ro.
   Fix some missing unlock in error handler.
   Add internal helper function instead open-code.
---
  fs/internal.h | 25 ++
  fs/namespace.c| 70 +++
  fs/super.c| 15 ++-
  include/linux/fs.h|  6 +
  include/linux/mount.h |  2 ++
  5 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe..8d6ef11 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -69,6 +69,31 @@ extern int __mnt_want_write_file(struct file *);
  extern void __mnt_drop_write(struct vfsmount *);
  extern void __mnt_drop_write_file(struct file *);

+/* rw_sem like read/write down trylock helpers for sb_want_write() */
+static inline int __sb_read_down_trylock(struct super_block *sb)
+{
+   if (!atomic_add_unless(sb-s_want_write_count, 1, -1))
+   return 0;
+   return 1;
+}
+
+static inline int __sb_write_down_trylock(struct super_block *sb)
+{
+   if (atomic_cmpxchg(sb-s_want_write_count, 0, -1))
+   return 0;
+   return 1;
+}
+
+static inline void __sb_read_up(struct super_block *sb)
+{
+   atomic_dec(sb-s_want_write_count);
+}
+
+static inline void __sb_write_up(struct super_block *sb)
+{
+   atomic_set(sb-s_want_write_count, 0);
+}
+
  /*
   * fs_struct.c
   */
diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e968..a4e8946 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1105,6 +1105,47 @@ struct vfsmount *mntget(struct vfsmount *mnt)
  }
  EXPORT_SYMBOL(mntget);

+/**
+ * sb_want_write - get write acess to a super block
+ * @sb: the superblock of the filesystem
+ *
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that the writes are allowed (superblock is read-write,
+ * filesystem is not frozen) before returning success.
+ * When the write operation is finished, sb_drop_write() must be called.
+ * This is much like mnt_want_write() as a refcount, but only needs
+ * the superblock to be read-write.
+ */
+int sb_want_write(struct super_block *sb)
+{
+
+   if (!__sb_read_down_trylock(sb))
+   return -EBUSY;
+
+   sb_start_write(sb);
+   if (sb-s_readonly_remount || sb-s_flags  MS_RDONLY) {
+   sb_drop_write(sb);
+   return -EROFS;
+   }
+   return 0;
+}
+EXPORT_SYMBOL(sb_want_write);
+
+/**
+ * sb_drop_write - give up write acess to a super block
+ * @sb: the superblock on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done performing writes to it and
+ * also allows filesystem to be frozen/remount ro again. Must be matched with
+ * sb_want_write() call above.
+ */
+void sb_drop_write(struct super_block *sb)
+{
+   sb_end_write(sb);
+   __sb_read_up(sb);
+}
+EXPORT_SYMBOL(sb_drop_write);
+
  struct vfsmount *mnt_clone_internal(struct path *path)
  {
struct mount *p;
@@ -1382,6 +1423,9 @@ static void shrink_submounts(struct mount *mnt);
  static int do_umount(struct mount *mnt, int flags)
  {
struct super_block *sb = mnt-mnt.mnt_sb;
+   struct mount *tmp;
+   int mounts = 0;
+   int sb_write_hold 

Re: [PATCH RFC] btrfs: csum: Introduce partial csum for tree block.

2015-06-18 Thread Qu Wenruo



David Sterba wrote on 2015/06/18 19:06 +0200:

On Thu, Jun 18, 2015 at 11:57:46AM -0400, Facebook wrote:

New new comments?


As our block sizes get bigger, it makes sense to think about more fine
grained checksums.  We're using crcs for:

1) memory corruption on the way down to the storage.  We could be very
small (bitflips) or smaller chunks (dma corrupting the whole bio).  The
places I've seen this in production, the partial crcs might help save a
percentage of the blocks, but overall the corruptions were just too
pervasive to get back the data.

2) incomplete writes.  We're sending down up to 64K btree blocks, the
storage might only write some of them.

3) IO errors from the drive.  These are likely to fail in much bigger
chunks and the partial csums probably won't help at all.

I think the best way to repair all of these is with replication, either
RAID5/6 or some number of mirrored copies.  It's more reliable than
trying to stitch together streams from multiple copies, and the code
complexity is much lower.


I agree with that. I'm still not convinced that adding all the kernel
code to repair the data is justified, compared to the block-level
redundancy alternatives.


Totally agree with this.
That's why we have support for RAID1/5/6/10.

I also hate to add complexity to kernel codes, especially when the scrub 
codes are already quite complex.


But in fact, my teammate Zhao Lei is already doing some work to make 
scrub codes clean and neat.
During his work, one of the thing needs to clean is the function to use 
the bios without IO error to rebuild a tree block from different mirrors.


I found it quite similar with the concept of partial csum, and may 
extract some quite generic codes for both of them, and hope to reduce 
the code amount overall.


But anyway, the main part, scrub support for partial csum, is still just
a basic idea(although some coding is already done), so I hopes to see 
more ideas even it's against partial csum.





But, where I do find the partial crcs interesting is the ability to
more accurately detect those three failure modes with our larger block
sizes.  That's pure statistics based on the crc we've chosen and the
size of the block.  The right answer might just be a different crc, but
I'm more than open to data here.


Good point, the detection aspect costs only the increased checksumming
and reporting. My assumption is that this will happen infrequently and
can point out serious hardware problems. In that case taking the
filesytem offline is a workable option and improving the userspace tools
to actually attempt the targeted block repair seems easier. Note that
this would come after redundant raid would not be able to fix it.



My original cause for partial csum is to improve btrfsck btree repair codes.
Current btree repair codes will drop all child nodes/leaves, which is 
quite a big loss, and deadly if the error happens at tree root.
If using partial csum, we can reduce the damage to as less as 1/8 of the 
node/leave.


So I'm completely OK to implement it in btrfsck, as it's much much 
easier to code and debug in user-space.



But the level of repair is not as high as btrfsck, which overall do 
repair in a higher level, like inode/file/extent repair.

With the nature of partial csum, it leans towards block level more.
So the idea comes to me to do it in kernel scrub codes.

For my personal opinion, if Zhao Lei and I can make the scrub codes much 
clearer and neater, I still consider the kernel scrub implement worthy a 
try.


Thanks,
Qu
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs-progs 4.1-rc1: btrfstune -u reporting incorrect current fsid?

2015-06-18 Thread Qu Wenruo



David Sterba wrote on 2015/06/18 19:33 +0200:

On Wed, Jun 17, 2015 at 10:21:45PM +0100, Mike Fleetwood wrote:

It also upper cases the UUID where as btrfs fi sh and blkid don't.


Ok, I'll switch that to lowercase so it's consistent with the rest.


I've done a quick test on changing the UUID of a btrfs.  It worked, but
btrfstune -u didn't print the same current uuid that btrfs fi sh does.


Seems that the reporting is broken in the btrfstune side. I've
reproduced it here. I've used btrfs-show-super in the tests and did not
notice that the 'current fsid' is wrong. Thanks.


Just a little tip to take less time on the bug:
---
--- a/btrfstune.c
+++ b/btrfstune.c
@@ -349,7 +349,7 @@ static int change_uuid(struct btrfs_fs_info 
*fs_info, const char *new_fsid_str)

fs_info-new_fsid = new_fsid;
fs_info-new_chunk_tree_uuid = new_chunk_id;

-   uuid_parse((const char*)fs_info-fsid, old_fsid);
+   memcpy(old_fsid, fs_info-fsid, BTRFS_UUID_SIZE);
uuid_unparse_upper(old_fsid, uuid_buf);
printf(Current fsid: %s\n, uuid_buf);

---

Also, you can remove the old_fsid variant if you want and just use 
fs_info-fsid.


Thanks,
Qu


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: add autodefrag inode flag

2015-06-18 Thread Omar Sandoval
In some cases, we may not want to enable automatic defragmentation for
the whole filesystem with the autodefrag mount option but we still
want to defragment specific files or directories. Add an inode flag
which allows us to do specify that.

Signed-off-by: Omar Sandoval osan...@fb.com
---
 fs/btrfs/ctree.h|  1 +
 fs/btrfs/file.c | 18 ++
 fs/btrfs/ioctl.c| 13 +++--
 include/uapi/linux/fs.h |  1 +
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8d3d..e898bb2822ef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2234,6 +2234,7 @@ do {  
 \
 #define BTRFS_INODE_NOATIME(1  9)
 #define BTRFS_INODE_DIRSYNC(1  10)
 #define BTRFS_INODE_COMPRESS   (1  11)
+#define BTRFS_INODE_AUTODEFRAG (1  12)
 
 #define BTRFS_INODE_ROOT_ITEM_INIT (1  31)
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e17479aa..33bead79da7a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -129,15 +129,17 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
return 0;
 }
 
-static inline int __need_auto_defrag(struct btrfs_root *root)
+static inline int __need_auto_defrag(struct btrfs_root *root,
+struct inode *inode)
 {
-   if (!btrfs_test_opt(root, AUTO_DEFRAG))
-   return 0;
-
if (btrfs_fs_closing(root-fs_info))
return 0;
 
-   return 1;
+   if (btrfs_test_opt(root, AUTO_DEFRAG))
+   return 1;
+   if (BTRFS_I(inode)-flags  BTRFS_INODE_AUTODEFRAG)
+   return 1;
+   return 0;
 }
 
 /*
@@ -152,7 +154,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
u64 transid;
int ret;
 
-   if (!__need_auto_defrag(root))
+   if (!__need_auto_defrag(root, inode))
return 0;
 
if (test_bit(BTRFS_INODE_IN_DEFRAG, BTRFS_I(inode)-runtime_flags))
@@ -199,7 +201,7 @@ static void btrfs_requeue_inode_defrag(struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)-root;
int ret;
 
-   if (!__need_auto_defrag(root))
+   if (!__need_auto_defrag(root, inode))
goto out;
 
/*
@@ -372,7 +374,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 fs_info-fs_state))
break;
 
-   if (!__need_auto_defrag(fs_info-tree_root))
+   if (btrfs_fs_closing(fs_info))
break;
 
/* find an inode to defrag */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1c22c6518504..c1a45d507613 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -121,6 +121,8 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
iflags |= FS_DIRSYNC_FL;
if (flags  BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
+   if (flags  BTRFS_INODE_AUTODEFRAG)
+   iflags |= FS_AUTODEFRAG_FL;
 
if ((flags  BTRFS_INODE_COMPRESS)  !(flags  BTRFS_INODE_NOCOMPRESS))
iflags |= FS_COMPR_FL;
@@ -157,7 +159,7 @@ void btrfs_update_iflags(struct inode *inode)
 /*
  * Inherit flags from the parent inode.
  *
- * Currently only the compression flags and the cow flags are inherited.
+ * Currently only the compression, cow, and autodefrag flags are inherited.
  */
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 {
@@ -182,6 +184,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode 
*dir)
BTRFS_I(inode)-flags |= BTRFS_INODE_NODATASUM;
}
 
+   if (flags  BTRFS_INODE_AUTODEFRAG)
+   BTRFS_I(inode)-flags |= BTRFS_INODE_AUTODEFRAG;
+
btrfs_update_iflags(inode);
 }
 
@@ -201,7 +206,7 @@ static int check_flags(unsigned int flags)
  FS_NOATIME_FL | FS_NODUMP_FL | \
  FS_SYNC_FL | FS_DIRSYNC_FL | \
  FS_NOCOMP_FL | FS_COMPR_FL |
- FS_NOCOW_FL))
+ FS_NOCOW_FL | FS_AUTODEFRAG_FL))
return -EOPNOTSUPP;
 
if ((flags  FS_NOCOMP_FL)  (flags  FS_COMPR_FL))
@@ -278,6 +283,10 @@ static int btrfs_ioctl_setflags(struct file *file, void 
__user *arg)
ip-flags |= BTRFS_INODE_DIRSYNC;
else
ip-flags = ~BTRFS_INODE_DIRSYNC;
+   if (flags  FS_AUTODEFRAG_FL)
+   ip-flags |= BTRFS_INODE_AUTODEFRAG;
+   else
+   ip-flags = ~BTRFS_INODE_AUTODEFRAG;
if (flags  FS_NOCOW_FL) {
if (S_ISREG(mode)) {
/*
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 9b964a5920af..8c7e8c7a0236 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -197,6 +197,7 @@ struct inodes_stat_t {
 #define FS_EXTENT_FL   

Re: [PATCH v9 9/9] btrfs: Use sb_want_write() to protect sysfs feature change.

2015-06-18 Thread Qu Wenruo

OK, it's 4.2 merge windows now...

But still no new comments?

Thanks,
Qu

Qu Wenruo wrote on 2015/05/15 14:47 +0800:

Ping.

Any comments?
Other v7 patchset is reviewed by David.

But I didn't find it in 4.1 merge windows.
Is something wrong or we are waiting for the vfs patch merged first?

Thanks,
Qu

 Original Message  
Subject: [PATCH v9 9/9] btrfs: Use sb_want_write() to protect sysfs
feature change.
From: Qu Wenruo quwen...@cn.fujitsu.com
To: linux-btrfs@vger.kernel.org
Date: 2015年02月16日 09:02


Just like label change, use sb_want_write() to do a correct protection.

Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
v4:
   Newly introduced.
v5:
   Change to sb_want_write().
v6:
   Move sb_want_write() to the beginning of the function.
v7:
   None
v8:
   Move sb_want_write() after get fs_info.
v9:
   Fix a unpaired sb_drop_write() in error handler.
---
  fs/btrfs/sysfs.c | 25 +
  1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 7e548f7..19876ba 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -121,10 +121,14 @@ static ssize_t btrfs_feature_attr_store(struct
kobject *kobj,
  if (!fs_info)
  return -EPERM;

-ret = kstrtoul(skip_spaces(buf), 0, val);
+ret = sb_want_write(fs_info-sb);
  if (ret)
  return ret;

+ret = kstrtoul(skip_spaces(buf), 0, val);
+if (ret)
+goto out;
+
  if (fa-feature_set == FEAT_COMPAT) {
  set = BTRFS_FEATURE_COMPAT_SAFE_SET;
  clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
@@ -141,22 +145,25 @@ static ssize_t btrfs_feature_attr_store(struct
kobject *kobj,
  /* Nothing to do */
  if ((val  (features  fa-feature_bit)) ||
  (!val  !(features  fa-feature_bit)))
-return count;
+goto out;

  if ((val  !(set  fa-feature_bit)) ||
  (!val  !(clear  fa-feature_bit))) {
  btrfs_info(fs_info,
  %sabling feature %s on mounted fs is not supported.,
  val ? En : Dis, fa-kobj_attr.attr.name);
-return -EPERM;
+ret = -EPERM;
+goto out;
  }

  btrfs_info(fs_info, %s %s feature flag,
 val ? Setting : Clearing, fa-kobj_attr.attr.name);

  trans = btrfs_start_transaction(fs_info-fs_root, 0);
-if (IS_ERR(trans))
-return PTR_ERR(trans);
+if (IS_ERR(trans)) {
+ret = PTR_ERR(trans);
+goto out;
+}

  spin_lock(fs_info-super_lock);
  features = get_features(fs_info, fa-feature_set);
@@ -168,10 +175,12 @@ static ssize_t btrfs_feature_attr_store(struct
kobject *kobj,
  spin_unlock(fs_info-super_lock);

  ret = btrfs_commit_transaction(trans, fs_info-fs_root);
-if (ret)
-return ret;

-return count;
+out:
+sb_drop_write(fs_info-sb);
+if (!ret)
+return count;
+return ret;
  }

  static umode_t btrfs_feature_visible(struct kobject *kobj,


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] Btrfs: incremental send, fix rmdir not send utimes

2015-06-18 Thread Filipe David Manana
On Thu, Jun 18, 2015 at 4:21 AM, Robbie Ko robbi...@synology.com wrote:
 Hi Filipe,

 I've found that the following case is the main cause of such error
 and it's fs tree is shown via btrfs-debug-tress as below.

 file tree key (459 ROOT_ITEM 20487)
 node 132988928 level 1 items 3 free 490 generation 20487 owner 459
 fs uuid b451ae42-3b03-4003-b0a4-45dce324557f
 chunk uuid d8831db3-2e42-4b32-9a5c-3efdf50d36bc
 key (256 INODE_ITEM 0) block 132710400 (8100) gen 20486
 key (264 INODE_ITEM 0) block 130695168 (7977) gen 20480
 key (266 XATTR_ITEM 952319794) block 126042112 (7693) gen 20464
 leaf 132710400 items 166 free space 3639 generation 20486 owner 455
 fs uuid b451ae42-3b03-4003-b0a4-45dce324557f
 chunk uuid d8831db3-2e42-4b32-9a5c-3efdf50d36bc
 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160
 inode generation 20425 transid 20442 size 32 block
 group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 flags 0x0
 item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12
 inode ref index 0 namelen 2 name: ..
 ...
 item 165 key (262 XATTR_ITEM 1100961104) itemoff 7789 itemsize 39
 location key (0 UNKNOWN.0 0) type XATTR
 namelen 8 datalen 1 name: user.a78
 data a
 binary 61
 leaf 130695168 items 133 free space 7332 generation 20480 owner 455
 fs uuid b451ae42-3b03-4003-b0a4-45dce324557f
 chunk uuid d8831db3-2e42-4b32-9a5c-3efdf50d36bc
 item 0 key (264 INODE_ITEM 0) itemoff 16123 itemsize 160
 inode generation 20428 transid 20434 size 10 block
 group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 flags 0x0
 item 1 key (264 INODE_REF 256) itemoff 16112 itemsize 11
 inode ref index 11 namelen 1 name: c
 ...

 We can see that inode 262 is right at the end of leaf. Then send_utime() will
 use btrfs_search_slot() to find a appropriate place to put 262 where is at the
  back of 262. However, that place is uninitialized on disk.
 Suppose we read
 atime tv_sec:576469548413222912, tv_nsec:1919251317 and then send it out.
 Receiving side will  got EINVAL since tv_nsec:1919251317 is greater
 than   999,999,999.

I see.
So in apply_dir_move, instead of searching the btree of the send
snapshot, we can search the rbtree of orphan dir infos for an entry
with a key == cur-dir. Searching that rbtree makes it clear what the
intention is and more efficient (fully in memory structure, and much
smaller than the btree). Should work, but I haven't tested it.

thanks


 Thanks.
 Robbie Ko

 2015-06-10 18:06 GMT+08:00 Robbie Ko robbi...@synology.com:
 Hi Filipi,

 2015-06-09 18:36 GMT+08:00 Filipe David Manana fdman...@gmail.com:
 On Tue, Jun 9, 2015 at 11:04 AM, Robbie Ko robbi...@synology.com wrote:
 Hi Filipe,

 2015-06-08 22:00 GMT+08:00 Filipe David Manana fdman...@gmail.com:
 On Mon, Jun 8, 2015 at 4:44 AM, Robbie Ko robbi...@synology.com wrote:
 Hi Filipe,

 Hi Robbie,


 I've fixed don't send utimes for non-existing directory with another 
 solution.

  In apply_dir_move(), the old parent dir. and new parent dir. will be
 updated after the current dir. has moved.

 And there's only one entry in old parent dir. (e.g. entry with
 smallest ino) will be tagged with rmdir_ino to prevent its parent dir.
 is deleted but updated.

 Can't parse this phrase. What do you mean by tagging an entry with 
 rmdir_ino?
 rmdir_ino corresponds to the number of a inode that wasn't deleted
 when it was processed because there was some inode with a lower number
 that is a child of the directory in the parent snapshot and had its
 rename/move operation delayed (it happens after the directory we want
 to delete is processed).


 Right , my tagged with rmdir_ino is same meaning as you explained here.


 However, if we process rename for another entry not tagged with
 rmdir_ino first, its old parent dir. which is deleted  will be updated
 according to apply_dir_move().

 Therefore, I think we should check the existence of  the dir. before
 we're going to update it's utime.

 The patch is pasted in the following link, could you give me some 
 comment?

 https://friendpaste.com/h8tZqOS9iAUpp2DvgGI2k

 Looks better.
 However I still don't understand your explanation, and just tried the
 example in your commit message:

 Parent snapshot:

 | a/ (ino 259)
   | c (ino 264)
 | b/ (ino 260)
   | d (ino 265)
 | del/ (ino 263)
   | item1/ (ino 261)
   | item2/ (ino 262)

 Send snapshot:
 | a/ (ino 259)
 | b/ (ino 260)
 | c/ (ino 2)
   | item2 (ino 259)
 | d/ (ino 257)
   | item1/ (ino 258)

 So it's confusing after looking at it.
 First the send snapshot mentions inode number 2, which doesn't exist
 in the parent snapshot - I assume you meant inode number 264.
 Then, the send snapshot has two inodes with number 259. Is item2 in
 the send snapshot supposed to be inode 262?


 Your guess is right. And I correct it as follow.

  # 

Re: btrfs-progs 4.1-rc1: btrfstune -u reporting incorrect current fsid?

2015-06-18 Thread David Sterba
On Wed, Jun 17, 2015 at 10:21:45PM +0100, Mike Fleetwood wrote:
 It also upper cases the UUID where as btrfs fi sh and blkid don't.

Ok, I'll switch that to lowercase so it's consistent with the rest.

 I've done a quick test on changing the UUID of a btrfs.  It worked, but
 btrfstune -u didn't print the same current uuid that btrfs fi sh does.

Seems that the reporting is broken in the btrfstune side. I've
reproduced it here. I've used btrfs-show-super in the tests and did not
notice that the 'current fsid' is wrong. Thanks.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs: fix deadlock with extent-same and readpage

2015-06-18 Thread Mark Fasheh
-readpage() does page_lock() before extent_lock(), we do the opposite in
extent-same. We want to reverse the order in btrfs_extent_same() but it's
not quite straightforward since the page locks are taken inside 
btrfs_cmp_data().

So I split btrfs_cmp_data() into 3 parts with a small context structure that
is passed between them. The first, btrfs_cmp_data_prepare() gathers up the
pages needed (taking page lock as required) and puts them on our context
structure. At this point, we are safe to lock the extent range. Afterwards,
we use btrfs_cmp_data() to do the data compare as usual and 
btrfs_cmp_data_free()
to clean up our context.

Signed-off-by: Mark Fasheh mfas...@suse.de
---
 fs/btrfs/ioctl.c | 148 +++
 1 file changed, 117 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2deea1f..b899584 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2755,14 +2755,11 @@ out:
return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
struct page *page;
-   pgoff_t index;
struct extent_io_tree *tree = BTRFS_I(inode)-io_tree;
 
-   index = off  PAGE_CACHE_SHIFT;
-
page = grab_cache_page(inode-i_mapping, index);
if (!page)
return NULL;
@@ -2783,6 +2780,20 @@ static struct page *extent_same_get_page(struct inode 
*inode, u64 off)
return page;
 }
 
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+  int num_pages, u64 off)
+{
+   int i;
+   pgoff_t index = off  PAGE_CACHE_SHIFT;
+
+   for (i = 0; i  num_pages; i++) {
+   pages[i] = extent_same_get_page(inode, index + i);
+   if (!pages[i])
+   return -ENOMEM;
+   }
+   return 0;
+}
+
 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 {
/* do any pending delalloc/csum calc on src, one way or
@@ -2808,52 +2819,120 @@ static inline void lock_extent_range(struct inode 
*inode, u64 off, u64 len)
}
 }
 
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
-   struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode 
*inode2)
 {
-   unlock_extent(BTRFS_I(inode1)-io_tree, loff1, loff1 + len - 1);
-   unlock_extent(BTRFS_I(inode2)-io_tree, loff2, loff2 + len - 1);
-
mutex_unlock(inode1-i_mutex);
mutex_unlock(inode2-i_mutex);
 }
 
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
+{
+   if (inode1  inode2)
+   swap(inode1, inode2);
+
+   mutex_lock_nested(inode1-i_mutex, I_MUTEX_PARENT);
+   if (inode1 != inode2)
+   mutex_lock_nested(inode2-i_mutex, I_MUTEX_CHILD);
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+   unlock_extent(BTRFS_I(inode1)-io_tree, loff1, loff1 + len - 1);
+   unlock_extent(BTRFS_I(inode2)-io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+struct inode *inode2, u64 loff2, u64 len)
 {
if (inode1  inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
-
-   mutex_lock_nested(inode1-i_mutex, I_MUTEX_PARENT);
lock_extent_range(inode1, loff1, len);
-   if (inode1 != inode2) {
-   mutex_lock_nested(inode2-i_mutex, I_MUTEX_CHILD);
+   if (inode1 != inode2)
lock_extent_range(inode2, loff2, len);
+}
+
+struct cmp_pages {
+   int num_pages;
+   struct page **src_pages;
+   struct page **dst_pages;
+};
+
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+   int i;
+   struct page *pg;
+
+   for (i = 0; i  cmp-num_pages; i++) {
+   pg = cmp-src_pages[i];
+   if (pg)
+   page_cache_release(pg);
+   pg = cmp-dst_pages[i];
+   if (pg)
+   page_cache_release(pg);
+   }
+   kfree(cmp-src_pages);
+   kfree(cmp-dst_pages);
+}
+
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+ struct inode *dst, u64 dst_loff,
+ u64 len, struct cmp_pages *cmp)
+{
+   int ret;
+   int num_pages = PAGE_CACHE_ALIGN(len)  PAGE_CACHE_SHIFT;
+   struct page **src_pgarr, **dst_pgarr;
+
+   /*
+* We must gather up all the pages before we initiate our
+* extent locking. We use an 

[PATCH 1/2] btrfs: pass unaligned length to btrfs_cmp_data()

2015-06-18 Thread Mark Fasheh
In the case that we dedupe the tail of a file, we might expand the dedupe
len out to the end of our last block. We don't want to compare data past
i_size however, so pass the original length to btrfs_cmp_data().

Signed-off-by: Mark Fasheh mfas...@suse.de
---
 fs/btrfs/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2d24ff4..2deea1f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2933,7 +2933,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, 
u64 olen,
goto out_unlock;
}
 
-   ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+   /* pass original length for comparison so we stay within i_size */
+   ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen);
if (ret == 0)
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
 
-- 
2.1.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] btrfs: two more dedupe fixes

2015-06-18 Thread Mark Fasheh
Hi Chris,

   The following fixes are based on top of my patch titled btrfs:
Handle unaligned length in extent_same which you have in your
'integration-4.2' branch:

https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?id=e1d227a42ea2b4664f94212bd1106b9a3413ffb8

The first patch in the series fixes a bug where we were sometimes
passing the aligned length to our comparison function. We actually can
stop at the user passed length for this as we don't need to compare
data past i_size (and we only align if the extents go out to i_size).

The 2nd patch fixes a deadlock between btrfs readpage and
btrfs_extent_same. This was reported on the list some months ago -
basically we had the page and extent locking reversed. My patch fixes
up the locking to be in the right order.

Thanks,
   --Mark
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: alias btrfs device delete to btrfs device remove

2015-06-18 Thread Omar Sandoval
There's an awkward asymmetry between btrfs device add and btrfs device
delete. Resolve this by aliasing delete to remove.

Signed-off-by: Omar Sandoval osan...@fb.com
---
 Documentation/btrfs-device.asciidoc |  5 -
 cmds-device.c   | 33 +++--
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/Documentation/btrfs-device.asciidoc 
b/Documentation/btrfs-device.asciidoc
index c56cf5ef48fb..2827598a37f5 100644
--- a/Documentation/btrfs-device.asciidoc
+++ b/Documentation/btrfs-device.asciidoc
@@ -74,9 +74,12 @@ do not perform discard by default
 -f|--force
 force overwrite of existing filesystem on the given disk(s)
 
-*delete* dev [dev...] path::
+*remove* dev [dev...] path::
 Remove device(s) from a filesystem identified by path.
 
+*delete* dev [dev...] path::
+Alias of remove kept for backwards compatability
+
 *ready* device::
 Check device to see if it has all of it's devices in cache for mounting.
 
diff --git a/cmds-device.c b/cmds-device.c
index 1022656988c2..0e1ea94a0e41 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -145,20 +145,17 @@ error_out:
 }
 
 static const char * const cmd_rm_dev_usage[] = {
-   btrfs device delete device [device...] path,
+   btrfs device remove device [device...] path,
Remove a device from a filesystem,
NULL
 };
 
-static int cmd_rm_dev(int argc, char **argv)
+static int _cmd_rm_dev(int argc, char **argv)
 {
char*mntpnt;
int i, fdmnt, ret=0, e;
DIR *dirstream = NULL;
 
-   if (check_argc_min(argc, 3))
-   usage(cmd_rm_dev_usage);
-
mntpnt = argv[argc - 1];
 
fdmnt = open_file_or_dir(mntpnt, dirstream);
@@ -198,6 +195,29 @@ static int cmd_rm_dev(int argc, char **argv)
return !!ret;
 }
 
+static int cmd_rm_dev(int argc, char **argv)
+{
+   if (check_argc_min(argc, 3))
+   usage(cmd_rm_dev_usage);
+
+   return _cmd_rm_dev(argc, argv);
+}
+
+
+static const char * const cmd_del_dev_usage[] = {
+   btrfs device delete device [device...] path,
+   Remove a device from a filesystem (alias of remove),
+   NULL
+};
+
+static int cmd_del_dev(int argc, char **argv)
+{
+   if (check_argc_min(argc, 3))
+   usage(cmd_del_dev_usage);
+
+   return _cmd_rm_dev(argc, argv);
+}
+
 static const char * const cmd_scan_dev_usage[] = {
btrfs device scan [(-d|--all-devices)|device [device...]],
Scan devices for a btrfs filesystem,
@@ -586,12 +606,13 @@ out:
 const struct cmd_group device_cmd_group = {
device_cmd_group_usage, NULL, {
{ add, cmd_add_dev, cmd_add_dev_usage, NULL, 0 },
-   { delete, cmd_rm_dev, cmd_rm_dev_usage, NULL, 0 },
+   { remove, cmd_rm_dev, cmd_rm_dev_usage, NULL, 0 },
{ scan, cmd_scan_dev, cmd_scan_dev_usage, NULL, 0 },
{ ready, cmd_ready_dev, cmd_ready_dev_usage, NULL, 0 },
{ stats, cmd_dev_stats, cmd_dev_stats_usage, NULL, 0 },
{ usage, cmd_device_usage,
cmd_device_usage_usage, NULL, 0 },
+   { delete, cmd_del_dev, cmd_del_dev_usage, NULL, 0 },
NULL_CMD_STRUCT
}
 };
-- 
2.4.3

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Corrupted btrfs partition (converted from ext4) after balance

2015-06-18 Thread Vianney Stroebel

One of my btrfs partition seems to have been corrupted.

Since I've tried to balance it, I can only mount it read-only. I have been able 
to use it read-only without problem so far so the data seems safe.

When I remove the ro option, the mount command hangs and some programs do 
not function properly (iotop hangs too and Firefox cannot load new web pages). Every few seconds a 
message is printed on syslog (see attached file).

If I try to terminate the mount process with ctrl+c, my whole system hangs.

This partition was converted from ext4 and I could use it fine after that. It got 
corrupted when I tried to balance it a few days ago (even though I think I had balanced 
it before, but I'm not sure about this). The balance would seem to have started but 
balance status showed no progress even after one hour.

This partition is on one hard disk (no raid). Mount options: 
defaults,compress=lzo,noatime,nodiratime,noauto,ro). My system also runs on 
btrfs on another disk (ssd) without any problems apart from quite poor 
performance (but that's for another post).

The command btrfs scrub start /_big -r hangs my system.

The command Konsole output btrfs check /dev/sdb1 outputs :
Checking filesystem on /dev/sdb1
UUID: 21873ba7-438a-4fbf-a051-ace28bffd264
checking extents
and stops after a few minutes with no other output.

I did not try btrfs check --repair.

Btrfs-zero-log doesn't seem to apply here.
Konsole output
I could copy the data on another freshly formatted disk and reformat this one 
but I am wondering if btrfs is stable enough to be used on my professional 
laptop (where I cannot afford such downtime)or if I should go back to ext4.

So the goal of this message is not only to see if I can repair this partition, 
but also to assess if btrfs corrupt partitions randomly and irreversibly. If 
the root cause resides in a non-essential feature (conversion or balancing for 
example), I would happily continue to use it without this feature.

This is my first message on this mailing list. I've spent the last hours trying 
to solve this.

More info:

uname -a
Linux viybel-pc 3.19.0-21-generic #21-Ubuntu SMP Sun Jun 14 18:31:11 UTC 2015 
x86_64 x86_64 x86_64 GNU/Linux

btrfs --version
btrfs-progs v3.19.1

btrfs fi show
Label: none  uuid: 358f485d-690d-436d-ad35-3a1f47329ed7
Total devices 1 FS bytes used 107.75GiB
devid1 size 111.79GiB used 111.79GiB path /dev/sda1

Label: none  uuid: 21873ba7-438a-4fbf-a051-ace28bffd264
Total devices 1 FS bytes used 606.17GiB
devid1 size 698.63GiB used 660.03GiB path /dev/sdb1

btrfs fi df /_big
Data, single: total=431.00GiB, used=419.49GiB
System, single: total=32.00MiB, used=64.00KiB
Metadata, single: total=229.00GiB, used=186.67GiB
GlobalReserve, single: total=512.00MiB, used=0.00B

dmesg  dmesg.log (attached)

Konsole outp
Vianney

Jun 18 23:38:30 viybel-pc kernel: [  903.963705] NMI watchdog: BUG: soft lockup 
- CPU#1 stuck for 22s! [mount:2556]
Jun 18 23:38:30 viybel-pc kernel: [  903.963714] Modules linked in: pci_stub 
vboxpci(OE) vboxnetadp(OE) vboxnetflt(OE) vboxdrv(OE) binfmt_misc btusb 
bluetooth uvcvideo hp_wmi videobuf2_vmalloc snd_hda_codec_hdmi(OE) 
sparse_keymap videobuf2_memops videobuf2_core v4l2_common arc4 videodev 
snd_hda_codec_idt(OE) media snd_hda_codec_generic(OE) snd_hda_intel(OE) 
snd_hda_codec(OE) snd_hda_core(OE) iwldvm snd_hwdep mac80211 snd_pcm 
intel_powerclamp coretemp snd_seq_midi snd_seq_midi_event kvm_intel kvm 
snd_rawmidi iwlwifi snd_seq joydev cfg80211 serio_raw snd_seq_device snd_timer 
snd hp_accel tpm_infineon lis3lv02d 8250_fintek input_polldev mei_me mac_hid 
lpc_ich mei soundcore intel_ips shpchp parport_pc ppdev lp parport autofs4 
btrfs xor raid6_pq hid_generic usbhid hid mmc_block i915 e1000e i2c_algo_bit 
drm_kms_helper firewire_ohci psmouse ptp ahci firewire_core sdhci_pci drm 
pps_core libahci sdhci crc_itu_t wmi video
Jun 18 23:38:30 viybel-pc kernel: [  903.963812] CPU: 1 PID: 2556 Comm: mount 
Tainted: GW  OEL 3.19.0-21-generic #21-Ubuntu
Jun 18 23:38:30 viybel-pc kernel: [  903.963816] Hardware name: Hewlett-Packard 
HP ProBook 6550b/146D, BIOS 68CDE Ver. F.06 01/11/2011
Jun 18 23:38:30 viybel-pc kernel: [  903.963820] task: 88006a7fd850 ti: 
88006b91 task.ti: 88006b91
Jun 18 23:38:30 viybel-pc kernel: [  903.963823] RIP: 0010:[817caac2] 
 [817caac2] _raw_spin_lock+0x12/0x80
Jun 18 23:38:30 viybel-pc kernel: [  903.963836] RSP: 0018:88006b913b18  
EFLAGS: 0282
Jun 18 23:38:30 viybel-pc kernel: [  903.963839] RAX: ae7cae7c RBX: 
8830 RCX: 0053
Jun 18 23:38:30 viybel-pc kernel: [  903.963842] RDX: ae7a RSI: 
0246 RDI: 8800b4e1dd70
Jun 18 23:38:30 viybel-pc kernel: [  903.963844] RBP: 88006b913b18 R08: 
0001ccc4 R09: 05f4
Jun 18 23:38:30 viybel-pc kernel: [  903.963847] R10: 0096 R11: 
05f4 R12: 817c1853
Jun 18 23:38:30 

[PATCH 2/2] btrfs-progs: Modify document and help message of fi show for units options

2015-06-18 Thread Qu Wenruo
Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
 Documentation/btrfs-filesystem.asciidoc | 28 +++-
 cmds-filesystem.c   |  8 
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/Documentation/btrfs-filesystem.asciidoc 
b/Documentation/btrfs-filesystem.asciidoc
index f1c35b6..033d3aa 100644
--- a/Documentation/btrfs-filesystem.asciidoc
+++ b/Documentation/btrfs-filesystem.asciidoc
@@ -117,15 +117,33 @@ partition after reducing the size of the filesystem.  
This can done using
 it with the new desired size.  When recreating the partition make sure to use
 the same starting disk cylinder as before.
 
-*show* [--mounted|--all-devices|path|uuid|device|label]::
+*show* [options] [path|uuid|device|label]::
 Show the btrfs filesystem with some additional info.
 +
 If no option nor path|uuid|device|label is passed, btrfs shows
 information of all the btrfs filesystem both mounted and unmounted.
-If '--mounted' is passed, it would probe btrfs kernel to list mounted btrfs
-filesystem(s);
-If '--all-devices' is passed, all the devices under /dev are scanned;
-otherwise the devices list is extracted from the /proc/partitions file.
++
+`Options`
++
+--mounted
+probe btrfs kernel to list mounted btrfs filesystems(s)
+--all-devices
+scan all devices under /dev, otherwise the devices list is extracted from the
+/proc/partitions file.
+--human-readable
+print human friendly numbers, base 1024, this is the default
+--iec
+select the 1024 base for the following options, according to the IEC standard
+--si
+select the 1000 base for the following options, according to the SI standard
+--kbytes
+show sizes in KiB, or kB with --si
+--mbytes
+show sizes in MiB, or MB with --si
+--gbytes
+show sizes in GiB, or GB with --si
+--tbytes
+show sizes in TiB, or TB with --si
 
 *sync* path::
 Force a sync for the filesystem identified by path.
diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index a9dbcfe..688fdb3 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -820,6 +820,14 @@ static const char * const cmd_show_usage[] = {
Show the structure of a filesystem,
-d|--all-devices   show only disks under /dev containing btrfs 
filesystem,
-m|--mounted   show only mounted btrfs,
+   --raw  raw numbers in bytes,
+   -human-readablehuman friendly numbers, base 1024(default),
+   --iec  use 1024 as a base (KiB, MiB, GiB, TiB),
+   --si   use 1000 as a base (kB, MB, GB, TB),
+   --kbytes   show sizes in KiB, or kB with --si,
+   --mbytes   show sizes in MiB, or MB with --si,
+   --gbytes   show sizes in GiB, or GB with --si,
+   --tbytes   show sizes in TiB, or TB with --si,
If no argument is given, structure of all present filesystems is 
shown.,
NULL
 };
-- 
2.4.3

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix fsync data loss after append write

2015-06-18 Thread Liu Bo
On Wed, Jun 17, 2015 at 12:49:23PM +0100, fdman...@kernel.org wrote:
 From: Filipe Manana fdman...@suse.com
 
 If we do an append write to a file (which increases its inode's i_size)
 that does not have the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode,
 and the previous transaction added a new hard link to the file, which sets
 the flag BTRFS_INODE_COPY_EVERYTHING in the file's inode, and then fsync
 the file, the inode's new i_size isn't logged. This has the consequence
 that after the fsync log is replayed, the file size remains what it was
 before the append write operation, which means users/applications will
 not be able to read the data that was successsfully fsync'ed before.
 
 This happens because neither the inode item nor the delayed inode get
 their i_size updated when the append write is made - doing so would
 require starting a transaction in the buffered write path, something that
 we do not do intentionally for performance reasons.
 
 Fix this by making sure that when the flag BTRFS_INODE_COPY_EVERYTHING is
 set the inode is logged with its current i_size (log the in-memory inode
 into the log tree).

Looks good to me.

Reviewed-by: Liu Bo bo.li@oracle.com

Thanks,

-liubo
 
 This issue is not a recent regression and is easy to reproduce with the
 following test case for fstests:
 
   seq=`basename $0`
   seqres=$RESULT_DIR/$seq
   echo QA output created by $seq
 
   here=`pwd`
   tmp=/tmp/$$
   status=1# failure is the default!
 
   _cleanup()
   {
   _cleanup_flakey
   rm -f $tmp.*
   }
   trap _cleanup; exit \$status 0 1 2 3 15
 
   # get standard environment, filters and checks
   . ./common/rc
   . ./common/filter
   . ./common/dmflakey
 
   # real QA test starts here
   _supported_fs generic
   _supported_os Linux
   _need_to_be_root
   _require_scratch
   _require_dm_flakey
   _require_metadata_journaling $SCRATCH_DEV
 
   _crash_and_mount()
   {
   # Simulate a crash/power loss.
   _load_flakey_table $FLAKEY_DROP_WRITES
   _unmount_flakey
   # Allow writes again and mount. This makes the fs replay its fsync 
 log.
   _load_flakey_table $FLAKEY_ALLOW_WRITES
   _mount_flakey
   }
 
   rm -f $seqres.full
 
   _scratch_mkfs  $seqres.full 21
   _init_flakey
   _mount_flakey
 
   # Create the test file with some initial data and then fsync it.
   # The fsync here is only needed to trigger the issue in btrfs, as it causes 
 the
   # the flag BTRFS_INODE_NEEDS_FULL_SYNC to be removed from the btrfs inode.
   $XFS_IO_PROG -f -c pwrite -S 0xaa 0 32k \
   -c fsync \
   $SCRATCH_MNT/foo | _filter_xfs_io
   sync
 
   # Add a hard link to our file.
   # On btrfs this sets the flag BTRFS_INODE_COPY_EVERYTHING on the btrfs 
 inode,
   # which is a necessary condition to trigger the issue.
   ln $SCRATCH_MNT/foo $SCRATCH_MNT/bar
 
   # Sync the filesystem to force a commit of the current btrfs transaction, 
 this
   # is a necessary condition to trigger the bug on btrfs.
   sync
 
   # Now append more data to our file, increasing its size, and fsync the file.
   # In btrfs because the inode flag BTRFS_INODE_COPY_EVERYTHING was set and 
 the
   # write path did not update the inode item in the btree nor the delayed 
 inode
   # item (in memory struture) in the current transaction (created by the fsync
   # handler), the fsync did not record the inode's new i_size in the fsync
   # log/journal. This made the data unavailable after the fsync log/journal is
   # replayed.
   $XFS_IO_PROG -c pwrite -S 0xbb 32K 32K \
-c fsync \
$SCRATCH_MNT/foo | _filter_xfs_io
 
   echo File content after fsync and before crash:
   od -t x1 $SCRATCH_MNT/foo
 
   _crash_and_mount
 
   echo File content after crash and log replay:
   od -t x1 $SCRATCH_MNT/foo
 
   status=0
   exit
 
 The expected file output before and after the crash/power failure expects the
 appended data to be available, which is:
 
   000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
   *
   010 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
   *
   020
 
 Cc: sta...@vger.kernel.org
 Signed-off-by: Filipe Manana fdman...@suse.com
 ---
  fs/btrfs/tree-log.c | 14 +-
  1 file changed, 9 insertions(+), 5 deletions(-)
 
 diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
 index d049683..4920fce 100644
 --- a/fs/btrfs/tree-log.c
 +++ b/fs/btrfs/tree-log.c
 @@ -4161,6 +4161,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
 *trans,
   u64 ino = btrfs_ino(inode);
   struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree;
   u64 logged_isize = 0;
 + bool need_log_inode_item = true;
  
   path = btrfs_alloc_path();
   if (!path)
 @@ -4269,11 +4270,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
 *trans,
   } else {
   if (inode_only == LOG_INODE_ALL)
   fast_search = true;
 -  

Re: [PATCH] fstests: generic test for fsync after adding xattr to a file

2015-06-18 Thread Filipe Manana
On Thu, Jun 18, 2015 at 4:48 AM, Eryu Guan eg...@redhat.com wrote:
 On Wed, Jun 17, 2015 at 12:52:47PM +0100, fdman...@kernel.org wrote:
 From: Filipe Manana fdman...@suse.com

 This test is motivated by an issue found in btrfs.

 It tests that after syncing the filesystem, adding a xattr to a file,
 syncing the filesystem again, writing to the file and then doing a fsync
 against that file, the xattr still exists after a power failure. That is,
 after the fsync log/journal is replayed, the xattr still exists and with
 the correct value.

 The btrfs issue is fixed by the patch titled:

   Btrfs: fix fsync xattr loss in the fast fsync path

 If I read the above patch correctly, the issue to be tested here was
 introduced by commit 4f764e515361 (Btrfs: remove deleted xattrs on
 fsync log replay), which is in mainline kernel since v4.1-rc1, so the
 test should fail on my v4.1-rc5 kernel, but I didn't see it fail.

 Is it reproducible everytime? Or did I miss something?

It is reproducible everytime, but you'll need my fix for the other
test (Btrfs: fix fsync data loss after append write) in order to
make this one fail. In other words, that other fix just exposes a
problem with commit 4f764e515361 (Btrfs: remove deleted xattrs on
fsync log replay).
Sorry it wasn't well documented. I'll see if I can make this test fail
even without the other fix applied (i.e. current 4.1-rcs).

thanks



 Signed-off-by: Filipe Manana fdman...@suse.com
 ---
  tests/generic/094 | 112 
 ++
  tests/generic/094.out |  29 +
  tests/generic/group   |   1 +
  3 files changed, 142 insertions(+)
  create mode 100755 tests/generic/094
  create mode 100644 tests/generic/094.out

 diff --git a/tests/generic/094 b/tests/generic/094
 new file mode 100755
 index 000..1c6d113
 --- /dev/null
 +++ b/tests/generic/094
 @@ -0,0 +1,112 @@
 +#! /bin/bash
 +# FS QA Test No. 094
 +#
 +# Test that after syncing the filesystem, adding a xattr to a file, syncing
 +# the filesystem again, writing to the file and then doing a fsync against 
 that
 +# file, the xattr still exists after a power failure. That is, after the 
 fsync
 +# log/journal is replayed, the xattr still exists and with the correct 
 value.
 +#
 +# This test is motivated by a bug found in btrfs.
 +#
 +#---
 +# Copyright (C) 2015 SUSE Linux Products GmbH. All Rights Reserved.
 +# Author: Filipe Manana fdman...@suse.com
 +#
 +# This program is free software; you can redistribute it and/or
 +# modify it under the terms of the GNU General Public License as
 +# published by the Free Software Foundation.
 +#
 +# This program is distributed in the hope that it would be useful,
 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 +# GNU General Public License for more details.
 +#
 +# You should have received a copy of the GNU General Public License
 +# along with this program; if not, write the Free Software Foundation,
 +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 +#---
 +#
 +
 +seq=`basename $0`
 +seqres=$RESULT_DIR/$seq
 +echo QA output created by $seq
 +
 +here=`pwd`
 +tmp=/tmp/$$
 +status=1 # failure is the default!
 +
 +_cleanup()
 +{
 + _cleanup_flakey
 + rm -f $tmp.*
 +}
 +trap _cleanup; exit \$status 0 1 2 3 15
 +
 +# get standard environment, filters and checks
 +. ./common/rc
 +. ./common/filter
 +. ./common/dmflakey
 +. ./common/attr
 +
 +# real QA test starts here
 +_supported_fs generic
 +_supported_os Linux
 +_need_to_be_root
 +_require_scratch
 +_require_dm_flakey
 +_require_attrs
 +_require_metadata_journaling $SCRATCH_DEV
 +
 +rm -f $seqres.full
 +
 +_scratch_mkfs  $seqres.full 21
 +_init_flakey
 +_mount_flakey
 +
 +# Create the test file with some initial data and make sure everything is
 +# durably persisted. We do fsync before calling sync to make sure that if 
 the
 +# filesystem is btrfs, we get the flag BTRFS_INODE_NEEDS_FULL_SYNC cleared
 +# from the btrfs inode - a condition necessary to trigger the issue in 
 btrfs.
 +$XFS_IO_PROG -f -c pwrite -S 0xaa 0 32k \
 + -c fsync \
 + $SCRATCH_MNT/foo | _filter_xfs_io
 +sync
 +
 +# Add a xattr to our file.
 +$SETFATTR_PROG -n user.attr -v somevalue $SCRATCH_MNT/foo
 +
 +# Sync the filesystem to force a commit of the current btrfs transaction, 
 this
 +# is a necessary condition to trigger the bug on btrfs.
 +sync
 +
 +# Now update our file's data and fsync the file.
 +# After a successful fsync, if the fsync log/journal is replayed we expect 
 to
 +# see the xattr named user.attr with a value of somevalue (and the 
 updated
 +# file data of course). Btrfs used to remove the xattr when it replayed the
 +# fsync log/journal.
 +$XFS_IO_PROG -c pwrite -S 0xbb 8K 16K \
 + -c fsync \
 

Re: RAID10 Balancing Request for Comments and Advices

2015-06-18 Thread Duncan
Vincent Olivier posted on Wed, 17 Jun 2015 09:46:50 -0400 as excerpted:

 On Jun 16, 2015, at 7:58 PM, Duncan 1i5t5.dun...@cox.net wrote:
 
 Yes.  GlobalReserve is for short-term btrfs-internal use, reserved for
 times when btrfs needs to (temporarily) allocate some space in ordered
 to free space, etc.  It's always single, and you'll rarely see anything
 but 0 used except perhaps in the middle of a balance or something.
 
 
 Get it. Thanks.
 
 Is there anyway to put that on another device, say, a SSD?

Not (AFAIK) presently.  There are various btrfs feature suggestions 
involving selective steering various btrfs component bits to faster or 
slower devices, etc, as can be seen on the wiki, but the btrfs chunk 
allocator isn't really customizable beyond basic raid-level, yet.  It 
does what it does and that's it.  For fancy features such as this, unless 
you're a company or individual with resources to invest in a specific 
feature of interest, I'd say give btrfs development another five years or 
so, and it may be tackling this sort of thing.

The two actually working alternatives I know of are bcached btrfs 
(there's someone on-list that actually does that and reports it working), 
and a more mature btrfs-similar solution such as zfs, tho of course zfs 
on Linux has its own issues, primarily licensing/legal.

 I am thinking
 of backing up this RAID10 on a 2x8TB device-managed SMR RAID1 and I want
 to minimize random write operations (noatime  al.). I will start a new
 thread for that maybe but first, is there something substantial I can
 read about btrfs+SMR? Or should I avoid SMR+btfs ?

I haven't the foggiest, but in case it spares someone looking up SMR like 
I just had to do, SMR = Shingled Magnetic Recording -- the new shingled 
drives that have been in the tech news since shortly before they started 
shipping in late 2013.

https://en.wikipedia.org/wiki/Shingled_magnetic_recording

 ok then, rule of the thumb re-run the scrub on “unverified checksum
 error(s)”. I have yet to see checksum errors yet but will keep it in
 mind..

FWIW, see my few minutes ago reply to Marc MERLIN in the BTRFS: read 
error corrected: ino 1 off  thread, if you're interested in further 
discussion on this.  

But regardless, based on my own experience, that's a good rule of thumb, 
yes. =:^)

 Meanwhile, I'm having a bit of morbid fun watching as [a dying ssd]
 slowly decays, getting experience of the process in a reasonably
 controlled setting without serious danger to my data, since it is
 backed up.

 You sure have morbid inclinations ! ;-)

=:^)

 Out of curiosity what is the frequency and sequence of smartctl
 long/short tests + btrfs scrubs ? Is it all automated ?

I haven't automated any of that, except that since this dying ssd thing 
started I created a small scriptlet (could be an alias, but I prefer 
scriptlets), bscrub, that runs btrfs scrub start -Bd $*, to avoid 
typing in the full command.  All I have to add is the mountpoint to 
scrub, possibly preceded by -r to read-only scrub /, which I keep read-
only mounted by default.

Perhaps to my harm I don't actually do the smart-tests regularly.  I'm 
not actually sure they're particularly useful on SSDs, particularly when 
using checksum-verified and raid-redundant filesystems such as btrfs in 
raid1/10 mode (and raid5/6 as it matures).  In practice btrfs scrub 
regularly reporting error corrected and/or nasty bus reset errors showing 
up in the logs are a pretty good advance indicators, better than smart 
status, from what I've seen.

I do check smartctrl -AH regularly, particularly now, but (in the past at 
least, I think my habit may be changing for the better, now, one of the 
positive results of letting the dying ssd run for the moment) less 
frequently when no problems are evident.

I actually have a pretty firm policy of splitting up my data onto 
separate filesystems (btrfs subvolumes don't cut it for me as all the 
data eggs are still in the same filesystem basket and if its bottom falls 
out, ), keeping them of easily managed and easily backed up size.  My 
largest btrfs is actually under 50 gig.  Between that and the fact that 
I'm using ssds, whole-filesystem maintenance (btrfs scrub, balance, and 
check commands) time is on the order of seconds to a few minutes (single 
digits) per filesystem.  As a result, running them is relatively trivial 
-- it doesn't take the hours to days people report for their multi-
terabyte btrfs on spinning rust, and I can and do sometimes run them on a 
whim.  Scrubs are generally under a minute per filesystem, with only a 
handful of filesystems routinely used, so under 10 minutes, total, 
including repeat-runs, on all routinely mounted btrfs.

Given the trivial time factor I basically simply integrated the scrub 
into my update procedure (weekly on average, tho it can be daily if I'm 
waiting on a fix or 10-14 days if I'm lazy), since that's my biggest 
filesystem changes and thus most likely to trigger new 

Unified output for btrfs? Fwd: Re: [Libguestfs] [PATCH] New API: btrfs_device_stats

2015-06-18 Thread Qu Wenruo

Hi all,

My colleagues are working on adding btrfs support to libguestfs.

But just like  Pino's complain, even in v4.1-rc1 btrfs-progs we reworked 
some UI of mkfs, the output from other btrfs subcommands still have 
their own output format.


Any idea to provide a unified output format for all btrfs commands?
Maybe a json output will be quite good?
Or an independent btrfs user-space library?

Or this is just what higher level program should do?
Just like the btrfs filters in fstests?

Thanks,
Qu

 Forward Message 
Subject: Re: [Libguestfs] [PATCH] New API: btrfs_device_stats
Date: Thu, 18 Jun 2015 10:41:56 +0200
From: Pino Toscano ptosc...@redhat.com
TO: libgues...@redhat.com

Hi,

On Thursday 18 June 2015 11:01:37 Cao jin wrote:

 Speaking of this: you said that you have a colleague working on
 btrfs-progs? What about suggesting to create some machine-parseable
 output (csv, xml, yaml, json, whatever) so extracting the results of
 btrfs tools is a lot more easy?


Yes, I forward your suggestion and consult him, the result is not
surprised:( Here is what I learned from him:
   For the btrfs-progs cmds who output strings, the output are plain,
don`t have patterns. Seen some guys who want a formatted output, they do
a filter by themself, it not reasonable to ask btrfs-progs to output
formatted strings.


This is exactly the issue here: every btrfs command has a totally
different formatting for its output. Just let him take a look at all
the commands implemented so far in libguestfs, you can easily spot
that:
- btrfs subvolume list
- btrfs subvolume show
- btrfs qgroup show
- btrfs balance status
- btrfs scrub status
- btrfs device stats
all have different custom parsers for each of their outputs.

Compare that to e.g. the -m parameter for parted, so it outputs fields
separated by semi-colon.

--
Pino Toscano

___
Libguestfs mailing list
libgues...@redhat.com
https://www.redhat.com/mailman/listinfo/libguestfs



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] btrfs-progs: Allow filesystem show command to handle different units

2015-06-18 Thread Qu Wenruo
Now filesystem show command can handle different units now.

This is handy for higher level programs to get accurate output from fi
show command.

Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
 cmds-filesystem.c | 66 +++
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index dcd3f47..a9dbcfe 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -375,7 +375,7 @@ static void splice_device_list(struct list_head 
*seed_devices,
 }
 
 static void print_devices(struct btrfs_fs_devices *fs_devices,
- u64 *devs_found)
+ u64 *devs_found, unsigned unit_mode)
 {
struct btrfs_device *device;
struct btrfs_fs_devices *cur_fs;
@@ -393,14 +393,16 @@ static void print_devices(struct btrfs_fs_devices 
*fs_devices,
list_for_each_entry(device, all_devices, dev_list) {
printf(\tdevid %4llu size %s used %s path %s\n,
   (unsigned long long)device-devid,
-  pretty_size(device-total_bytes),
-  pretty_size(device-bytes_used), device-name);
+  pretty_size_mode(device-total_bytes, unit_mode),
+  pretty_size_mode(device-bytes_used, unit_mode),
+  device-name);
 
(*devs_found)++;
}
 }
 
-static void print_one_uuid(struct btrfs_fs_devices *fs_devices)
+static void print_one_uuid(struct btrfs_fs_devices *fs_devices,
+  unsigned unit_mode)
 {
char uuidbuf[BTRFS_UUID_UNPARSED_SIZE];
struct btrfs_device *device;
@@ -421,9 +423,9 @@ static void print_one_uuid(struct btrfs_fs_devices 
*fs_devices)
total = device-total_devs;
printf( uuid: %s\n\tTotal devices %llu FS bytes used %s\n, uuidbuf,
   (unsigned long long)total,
-  pretty_size(device-super_bytes_used));
+  pretty_size_mode(device-super_bytes_used, unit_mode));
 
-   print_devices(fs_devices, devs_found);
+   print_devices(fs_devices, devs_found, unit_mode);
 
if (devs_found  total) {
printf(\t*** Some devices missing\n);
@@ -445,7 +447,7 @@ static u64 calc_used_bytes(struct btrfs_ioctl_space_args 
*si)
 static int print_one_fs(struct btrfs_ioctl_fs_info_args *fs_info,
struct btrfs_ioctl_dev_info_args *dev_info,
struct btrfs_ioctl_space_args *space_info,
-   char *label, char *path)
+   char *label, char *path, unsigned unit_mode)
 {
int i;
int fd;
@@ -468,7 +470,8 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args 
*fs_info,
 
printf( uuid: %s\n\tTotal devices %llu FS bytes used %s\n, uuidbuf,
fs_info-num_devices,
-   pretty_size(calc_used_bytes(space_info)));
+   pretty_size_mode(calc_used_bytes(space_info),
+unit_mode));
 
for (i = 0; i  fs_info-num_devices; i++) {
char *canonical_path;
@@ -485,8 +488,8 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args 
*fs_info,
canonical_path = canonicalize_path((char *)tmp_dev_info-path);
printf(\tdevid %4llu size %s used %s path %s\n,
tmp_dev_info-devid,
-   pretty_size(tmp_dev_info-total_bytes),
-   pretty_size(tmp_dev_info-bytes_used),
+   pretty_size_mode(tmp_dev_info-total_bytes, unit_mode),
+   pretty_size_mode(tmp_dev_info-bytes_used, unit_mode),
canonical_path);
 
free(canonical_path);
@@ -498,7 +501,7 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args 
*fs_info,
return 0;
 }
 
-static int btrfs_scan_kernel(void *search)
+static int btrfs_scan_kernel(void *search, unsigned unit_mode)
 {
int ret = 0, fd;
int found = 0;
@@ -537,7 +540,8 @@ static int btrfs_scan_kernel(void *search)
fd = open(mnt-mnt_dir, O_RDONLY);
if ((fd != -1)  !get_df(fd, space_info_arg)) {
print_one_fs(fs_info_arg, dev_info_arg,
-   space_info_arg, label, mnt-mnt_dir);
+space_info_arg, label, mnt-mnt_dir,
+unit_mode);
kfree(space_info_arg);
memset(label, 0, sizeof(label));
found = 1;
@@ -833,6 +837,7 @@ static int cmd_show(int argc, char **argv)
char path[PATH_MAX];
__u8 fsid[BTRFS_FSID_SIZE];
char uuid_buf[BTRFS_UUID_UNPARSED_SIZE];
+   unsigned unit_mode = UNITS_DEFAULT;
int found = 0;
 
while (1) {
@@ -840,6 +845,15 @@ static int cmd_show(int argc, char **argv)
static const struct