[PATCH 3/5] btrfs: fix clone / extent-same deadlocks

2015-06-26 Thread Mark Fasheh
Clone and extent same lock their source and target inodes in opposite order.
In addition to this, the range locking in clone doesn't take ordering into
account. Fix this by having clone use the same locking helpers as
btrfs-extent-same.

In addition, I do a small cleanup of the locking helpers, removing a case
(both inodes being the same) which was poorly accounted for and never
actually used by the callers.

Signed-off-by: Mark Fasheh 
Reviewed-by: David Sterba 
---
 fs/btrfs/ioctl.c | 34 --
 1 file changed, 8 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b899584..8d6887d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2831,8 +2831,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, 
struct inode *inode2)
swap(inode1, inode2);
 
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-   if (inode1 != inode2)
-   mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+   mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
 }
 
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -2850,8 +2849,7 @@ static void btrfs_double_extent_lock(struct inode 
*inode1, u64 loff1,
swap(loff1, loff2);
}
lock_extent_range(inode1, loff1, len);
-   if (inode1 != inode2)
-   lock_extent_range(inode2, loff2, len);
+   lock_extent_range(inode2, loff2, len);
 }
 
 struct cmp_pages {
@@ -3713,13 +3711,7 @@ static noinline long btrfs_ioctl_clone(struct file 
*file, unsigned long srcfd,
goto out_fput;
 
if (!same_inode) {
-   if (inode < src) {
-   mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-   mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
-   } else {
-   mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-   mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-   }
+   btrfs_double_inode_lock(src, inode);
} else {
mutex_lock(&src->i_mutex);
}
@@ -3769,8 +3761,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, 
unsigned long srcfd,
 
lock_extent_range(src, lock_start, lock_len);
} else {
-   lock_extent_range(src, off, len);
-   lock_extent_range(inode, destoff, len);
+   btrfs_double_extent_lock(src, off, inode, destoff, len);
}
 
ret = btrfs_clone(src, inode, off, olen, len, destoff);
@@ -3781,9 +3772,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, 
unsigned long srcfd,
 
unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
} else {
-   unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
-   unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
- destoff + len - 1);
+   btrfs_double_extent_unlock(src, off, inode, destoff, len);
}
/*
 * Truncate page cache pages so that future reads will see the cloned
@@ -3792,17 +3781,10 @@ static noinline long btrfs_ioctl_clone(struct file 
*file, unsigned long srcfd,
truncate_inode_pages_range(&inode->i_data, destoff,
   PAGE_CACHE_ALIGN(destoff + len) - 1);
 out_unlock:
-   if (!same_inode) {
-   if (inode < src) {
-   mutex_unlock(&src->i_mutex);
-   mutex_unlock(&inode->i_mutex);
-   } else {
-   mutex_unlock(&inode->i_mutex);
-   mutex_unlock(&src->i_mutex);
-   }
-   } else {
+   if (!same_inode)
+   btrfs_double_inode_unlock(src, inode);
+   else
mutex_unlock(&src->i_mutex);
-   }
 out_fput:
fdput(src_file);
 out_drop_write:
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] btrfs: allow dedupe of same inode

2015-06-26 Thread Mark Fasheh
clone() supports cloning within an inode so extent-same can do
the same now. This patch fixes up the locking in extent-same to
know about the single-inode case. In addition to that, we add a
check for overlapping ranges, which clone does not allow.

Signed-off-by: Mark Fasheh 
Reviewed-by: David Sterba 
---
 fs/btrfs/ioctl.c | 76 
 1 file changed, 60 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d6887d..83f4679 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2979,27 +2979,61 @@ static int btrfs_extent_same(struct inode *src, u64 
loff, u64 olen,
int ret;
u64 len = olen;
struct cmp_pages cmp;
+   int same_inode = 0;
+   u64 same_lock_start = 0;
+   u64 same_lock_len = 0;
 
-   /*
-* btrfs_clone() can't handle extents in the same file
-* yet. Once that works, we can drop this check and replace it
-* with a check for the same inode, but overlapping extents.
-*/
if (src == dst)
-   return -EINVAL;
+   same_inode = 1;
 
if (len == 0)
return 0;
 
-   btrfs_double_inode_lock(src, dst);
+   if (same_inode) {
+   mutex_lock(&src->i_mutex);
 
-   ret = extent_same_check_offsets(src, loff, &len, olen);
-   if (ret)
-   goto out_unlock;
+   ret = extent_same_check_offsets(src, loff, &len, olen);
+   if (ret)
+   goto out_unlock;
 
-   ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
-   if (ret)
-   goto out_unlock;
+   /*
+* Single inode case wants the same checks, except we
+* don't want our length pushed out past i_size as
+* comparing that data range makes no sense.
+*
+* extent_same_check_offsets() will do this for an
+* unaligned length at i_size, so catch it here and
+* reject the request.
+*
+* This effectively means we require aligned extents
+* for the single-inode case, whereas the other cases
+* allow an unaligned length so long as it ends at
+* i_size.
+*/
+   if (len != olen) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Check for overlapping ranges */
+   if (dst_loff + len > loff && dst_loff < loff + len) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   same_lock_start = min_t(u64, loff, dst_loff);
+   same_lock_len = max_t(u64, loff, dst_loff) + len - 
same_lock_start;
+   } else {
+   btrfs_double_inode_lock(src, dst);
+
+   ret = extent_same_check_offsets(src, loff, &len, olen);
+   if (ret)
+   goto out_unlock;
+
+   ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
+   if (ret)
+   goto out_unlock;
+   }
 
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
@@ -3012,18 +3046,28 @@ static int btrfs_extent_same(struct inode *src, u64 
loff, u64 olen,
if (ret)
goto out_unlock;
 
-   btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+   if (same_inode)
+   lock_extent_range(src, same_lock_start, same_lock_len);
+   else
+   btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
 
/* pass original length for comparison so we stay within i_size */
ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
if (ret == 0)
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
 
-   btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+   if (same_inode)
+   unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
+ same_lock_start + same_lock_len - 1);
+   else
+   btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
 
btrfs_cmp_data_free(&cmp);
 out_unlock:
-   btrfs_double_inode_unlock(src, dst);
+   if (same_inode)
+   mutex_unlock(&src->i_mutex);
+   else
+   btrfs_double_inode_unlock(src, dst);
 
return ret;
 }
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] btrfs: fix deadlock with extent-same and readpage

2015-06-26 Thread Mark Fasheh
->readpage() does page_lock() before extent_lock(), we do the opposite in
extent-same. We want to reverse the order in btrfs_extent_same() but it's
not quite straightforward since the page locks are taken inside 
btrfs_cmp_data().

So I split btrfs_cmp_data() into 3 parts with a small context structure that
is passed between them. The first, btrfs_cmp_data_prepare() gathers up the
pages needed (taking page lock as required) and puts them on our context
structure. At this point, we are safe to lock the extent range. Afterwards,
we use btrfs_cmp_data() to do the data compare as usual and 
btrfs_cmp_data_free()
to clean up our context.

Signed-off-by: Mark Fasheh 
Reviewed-by: David Sterba 
---
 fs/btrfs/ioctl.c | 148 +++
 1 file changed, 117 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2deea1f..b899584 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2755,14 +2755,11 @@ out:
return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
struct page *page;
-   pgoff_t index;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
-   index = off >> PAGE_CACHE_SHIFT;
-
page = grab_cache_page(inode->i_mapping, index);
if (!page)
return NULL;
@@ -2783,6 +2780,20 @@ static struct page *extent_same_get_page(struct inode 
*inode, u64 off)
return page;
 }
 
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+  int num_pages, u64 off)
+{
+   int i;
+   pgoff_t index = off >> PAGE_CACHE_SHIFT;
+
+   for (i = 0; i < num_pages; i++) {
+   pages[i] = extent_same_get_page(inode, index + i);
+   if (!pages[i])
+   return -ENOMEM;
+   }
+   return 0;
+}
+
 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 {
/* do any pending delalloc/csum calc on src, one way or
@@ -2808,52 +2819,120 @@ static inline void lock_extent_range(struct inode 
*inode, u64 off, u64 len)
}
 }
 
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
-   struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode 
*inode2)
 {
-   unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
-   unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
 }
 
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
+{
+   if (inode1 < inode2)
+   swap(inode1, inode2);
+
+   mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+   if (inode1 != inode2)
+   mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+   unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+   unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+struct inode *inode2, u64 loff2, u64 len)
 {
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
-
-   mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
lock_extent_range(inode1, loff1, len);
-   if (inode1 != inode2) {
-   mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+   if (inode1 != inode2)
lock_extent_range(inode2, loff2, len);
+}
+
+struct cmp_pages {
+   int num_pages;
+   struct page **src_pages;
+   struct page **dst_pages;
+};
+
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+   int i;
+   struct page *pg;
+
+   for (i = 0; i < cmp->num_pages; i++) {
+   pg = cmp->src_pages[i];
+   if (pg)
+   page_cache_release(pg);
+   pg = cmp->dst_pages[i];
+   if (pg)
+   page_cache_release(pg);
+   }
+   kfree(cmp->src_pages);
+   kfree(cmp->dst_pages);
+}
+
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+ struct inode *dst, u64 dst_loff,
+ u64 len, struct cmp_pages *cmp)
+{
+   int ret;
+   int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+   struct page **src_pgarr, **dst_pgarr;
+
+   /*
+* We must gather up all the pages before we init

[PATCH 5/5] btrfs: don't update mtime on deduped inodes

2015-06-26 Thread Mark Fasheh
One issue users have reported is that dedupe changes mtime on files,
resulting in tools like rsync thinking that their contents have changed when
in fact the data is exactly the same. Clone still wants an mtime change, so
we special case this in the code.

This was tested with the btrfs-extent-same tool.

Signed-off-by: Mark Fasheh 
---
 fs/btrfs/ioctl.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 83f4679..0af0f13 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
 
 
 static int btrfs_clone(struct inode *src, struct inode *inode,
-  u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+  u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+  int no_mtime);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -3054,7 +3055,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, 
u64 olen,
/* pass original length for comparison so we stay within i_size */
ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
if (ret == 0)
-   ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
+   ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
 
if (same_inode)
unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
@@ -3219,13 +3220,17 @@ static int clone_finish_inode_update(struct 
btrfs_trans_handle *trans,
 struct inode *inode,
 u64 endoff,
 const u64 destoff,
-const u64 olen)
+const u64 olen,
+int no_mtime)
 {
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
 
inode_inc_iversion(inode);
-   inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+   if (no_mtime)
+   inode->i_ctime = CURRENT_TIME;
+   else
+   inode->i_mtime = inode->i_ctime = CURRENT_TIME;
/*
 * We round up to the block size at eof when determining which
 * extents to clone above, but shouldn't round up the file size.
@@ -3310,13 +3315,13 @@ static void clone_update_extent_map(struct inode *inode,
  * @inode: Inode to clone to
  * @off: Offset within source to start clone from
  * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- *   identical values here
+ * @olen_aligned: Block-aligned value of olen
  * @destoff: Offset within @inode to start clone
+ * @no_mtime: Whether to update mtime on the target inode
  */
 static int btrfs_clone(struct inode *src, struct inode *inode,
   const u64 off, const u64 olen, const u64 olen_aligned,
-  const u64 destoff)
+  const u64 destoff, int no_mtime)
 {
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -3640,7 +3645,7 @@ process_slot:
  root->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
-   destoff, olen);
+   destoff, olen, 
no_mtime);
if (ret)
goto out;
if (new_key.offset + datal >= destoff + len)
@@ -3678,7 +3683,7 @@ process_slot:
clone_update_extent_map(inode, trans, NULL, last_dest_end,
destoff + len - last_dest_end);
ret = clone_finish_inode_update(trans, inode, destoff + len,
-   destoff, olen);
+   destoff, olen, no_mtime);
}
 
 out:
@@ -3808,7 +3813,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, 
unsigned long srcfd,
btrfs_double_extent_lock(src, off, inode, destoff, len);
}
 
-   ret = btrfs_clone(src, inode, off, olen, len, destoff);
+   ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 
if (same_inode) {
u64 lock_start = min_t(u64, off, destoff);
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/5] btrfs: dedupe fixes, features V4

2015-06-26 Thread Mark Fasheh
Hi Chris,

   The following patches are based on top of my patch titled "btrfs:
Handle unaligned length in extent_same" which you have in your
'integration-4.2' branch:

https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?id=e1d227a42ea2b4664f94212bd1106b9a3413ffb8

The series contains three fixes and two features.

The first patch in the series fixes a bug where we were sometimes
passing the aligned length to our comparison function. We actually can
stop at the user passed length for this as we don't need to compare
data past i_size (and we only align if the extents go out to i_size).

The 2nd patch fixes a deadlock between btrfs readpage and
btrfs_extent_same. This was reported on the list some months ago -
basically we had the page and extent locking reversed. My patch fixes
up the locking to be in the right order.

The 3rd patch fixes a deadlocks in clone() (wrt extent-same) which
David found while reviewing my fixes. I also found that clone doesn't
lock extent ranges in any particular order which could obvioulsy be a
problem so that is fixed there too.

The last two patches add features which have been requested often by
users - the 4th adds the ability to dedupe within the same inode, and
the last patch fixes up the clone code to skip updating mtime when
called from extent-same (this helps with backup software which is
being confused into re-backing up deduped files).

Changes from V3 to V4:
  - change mtime patch from a flag to default behavior

Changes from V2 to V3:
  - better flag naming in patch #5, thanks to David

Changes from V1 to V2:
  - added patches 3-5

Thanks,
   --Mark
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] btrfs: pass unaligned length to btrfs_cmp_data()

2015-06-26 Thread Mark Fasheh
In the case that we dedupe the tail of a file, we might expand the dedupe
len out to the end of our last block. We don't want to compare data past
i_size however, so pass the original length to btrfs_cmp_data().

Signed-off-by: Mark Fasheh 
Reviewed-by: David Sterba 
---
 fs/btrfs/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2d24ff4..2deea1f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2933,7 +2933,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, 
u64 olen,
goto out_unlock;
}
 
-   ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+   /* pass original length for comparison so we stay within i_size */
+   ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen);
if (ret == 0)
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
 
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit

2015-06-26 Thread Lutz Vieweg

Testing the patch took much longer than I anticipated due to pre-4.1-kernels
being "too risky" for use on our servers, but now it's done and I can say:

This patch, as integrated in linux-4.1, has successfully removed the lags.

Thanks!

Regards,

Lutz Vieweg


On 04/22/2015 06:09 PM, Lutz Vieweg wrote:

On 04/13/2015 09:52 PM, Chris Mason wrote:

Large filesystems with lots of block groups can suffer long stalls during
commit while we create and send down all of the block group caches.  The
more blocks groups dirtied in a transaction, the longer these stalls can be.
Some workloads average 10 seconds per commit, but see peak times much higher.


Since we see this problem very frequently on some shared development servers,
I will try to install this ASAP.

Meanwhile, can anybody already tell success stories about successfully removing
lags by this patch?

Regards,

Lutz Vieweg

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


btrfs filesystem defrag fails with "Resource temporarily unavailable" when strace'd

2015-06-26 Thread Lutz Vieweg

I started...
  btrfs filesystem defragment -v /some/directory
and this took (unexpectedly, for just 150M of data) a long time (minutes).

So I wondered whether I could see via "strace -p PID" what was going on with 
the process.

Immediately when strace tries to attach, the btrfs process dies with this error 
message:


ERROR: defrag failed on /some/directory - Resource temporarily unavailable


This is 100% reproducable.

The strace output looks like this:

Process 1402 attached
brk(0)  = 0x1468000
brk(0)  = 0x1468000
brk(0x146)  = 0x146
brk(0)  = 0x146
close(3)= 0
write(2, "ERROR: defrag failed on /some/d"..., 84) = 84
write(1, "btrfs-progs v4.1\n", 17)  = 17
write(2, "total 1 failures\n", 17)  = 17
exit_group(1)   = ?
+++ exited with 1 +++


This is the first time I see a prozess crash upon strace being attached -
is this intended behaviour? If so... why?

(Tried again without strace, "the btrfs filesystem defragment" command later
finished ok, albeight after a loong time.)

Regards,

Lutz Vieweg

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


fuser fails to list processes that have a directory on a btrfs filesystem as their current working directory

2015-06-26 Thread Lutz Vieweg

JFYI: Not a btrfs bug, but a btrfs-specific misbehaviour of "fuser" that
might easily be misinterpreted as a btrfs bug:

http://bugzilla.centos.org/view.php?id=8966


/usr/sbin/fuser (from psmisc-22.20-8.el7.x86_64) does not list any processes 
that have a directory on a btrfs file system as their current working directory.

(For file systems other than btrfs, this works.)



Steps To Reproduce:
mkdir /some/btrfs/testdir
cd /some/btrfs/testdir
fuser -v -m /some/btrfs

This should list your current shell as one of the processes using the 
/some/btrfs file system, but it doesn't

The same on /some/xfs/testdir works fine.



This bug is not present in upstream "psmisc" sources as of current master in
git://git.code.sf.net/p/psmisc/code

An fuser executable compiled from the current upstream sources works fine also 
for btrfs.


Regards,

Lutz Vieweg

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs-progs: Add nbytes output for print-tree and reformat inode output

2015-06-26 Thread David Sterba
On Fri, Jun 19, 2015 at 11:49:19AM +0800, Qu Wenruo wrote:
> The original implement doesn't output nbytes in btrfs_inode.
> Add the output and since the output is too long, reformat it to multi
> lines.
> 
> This is very handy to debug related bugs.
> 
> Signed-off-by: Qu Wenruo 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs-progs: alias btrfs device delete to btrfs device remove

2015-06-26 Thread David Sterba
On Wed, Jun 24, 2015 at 09:09:17AM -0700, Omar Sandoval wrote:
> There's an awkward asymmetry between btrfs device add and btrfs device
> delete. Resolve this by aliasing delete to remove.
> 
> Signed-off-by: Omar Sandoval 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs-progs: replace struct cmd_group->hidden with flags

2015-06-26 Thread David Sterba
On Wed, Jun 24, 2015 at 09:09:16AM -0700, Omar Sandoval wrote:
> We're also going to want to support aliases, so rather than adding
> another member, replace "hidden" with a "flags" member.
> 
> Signed-off-by: Omar Sandoval 

Fixed compilation breakage and applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: inspect: Fix out of bounds string termination.

2015-06-26 Thread David Sterba
On Fri, Jun 26, 2015 at 10:43:56AM +0200, Patrik Lundquist wrote:
> Signed-off-by: Patrik Lundquist 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT PULL] Btrfs

2015-06-26 Thread David Sterba
On Tue, Jun 02, 2015 at 10:02:49AM -0400, Josh Boyer wrote:
> >>> Chris Mason (1) commits (+18/-0):
> >>> Btrfs: fix regression in raid level conversion
> >>
> >> Shouldn't this be CC'd to stable since it fixes a broken commit in 4.0?
> >
> > Yes, I'm retesting two of these against 4.0
> 
> Did you finish up testing the two commits?  I might have missed it,
> but I haven't seen anything being requested to the 4.0.y stable tree
> yet.

JFYI, fixed in 4.0.6 .
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: Improve FL_KEEP_SIZE handling in fallocate.

2015-06-26 Thread David Sterba
On Mon, Apr 06, 2015 at 10:09:15PM -0700, Davide Italiano wrote:
> - We call inode_size_ok() only if FL_KEEP_SIZE isn't specified.
> - As an optimisation we can skip the call if (off + len)
>   isn't greater than the current size of the file. This operation
>   is called under the lock so the less work we do, the better.
> - If we call inode_size_ok() pass to it the correct value rather
>   than a more conservative estimation.
> 
> Signed-off-by: Davide Italiano 

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Automatic balance after mkfs?

2015-06-26 Thread David Sterba
On Thu, Jun 18, 2015 at 01:41:07PM +0800, Qu Wenruo wrote:
> >>> Yes. It's an artefact of the way that mkfs works. If you run a
> >>> balance on those chunks, they'll go away. (btrfs balance start
> >>> -dusage=0 -musage=0 /mountpoint)
> >>
> >> Since I had to explain this very same thing to a new btrfs-using friend
> >> just yesterday I wondered if it might not make sense for mkfs to issue
> >> a general balance after creating the fs? It should be simple enough
> >> (just issue the balance ioctl?) and not have any negative side effects.
> >>
> >> Just doing such a post-mkfs cleanup automatically would certainly
> >> reduce the number of times we have to explain the this. :)
> >>
> >> Any reasons why we couldn't/shouldn't do this?
> >>
> > Following the same line of thinking, is there any reason we couldn't
> > just rewrite mkfs to get rid of this legacy behavior?

The 'single' blockgroups on multidevice filesystem are considered a bug
in mkfs, an annoying and long running one.

> Compared to the more complex auto balance, rewrite mkfs is a much better 
> idea.

Balance is a workaround besides that it requires mouting.

> The original mkfs seems easy for developers, but bad for users.

I'd argue that mkfs is primarily for users.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] btrfs device remove alias

2015-06-26 Thread David Sterba
On Fri, Jun 26, 2015 at 09:10:56AM +0800, Anand Jain wrote:
> while on this. its also good idea to create alias for
> 
>btrfs replace start -> btrfs device replace.

This was asked for back then, and briefly discussed on irc (11/2012).
The preference was not to do too much typing, although the command
hierarchy would become less structured and can cause some trouble when
looking for docs.

I'm slightly worried about adding more aliases as it can cause confusion
when writing documentaiton and recommending how to do things.  But I
understand the motivations to make the interface more consistent or
convenient to use.

In this case it's moving replace to the expected place. I'm not against
it but more feedback would help.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: i_version vs iversion (Was: Re: [RFC PATCH v2 1/2] Btrfs: add noi_version option to disable MS_I_VERSION)

2015-06-26 Thread J. Bruce Fields
On Thu, Jun 25, 2015 at 06:12:57PM -0400, Theodore Ts'o wrote:
> On Thu, Jun 25, 2015 at 02:46:44PM -0400, J. Bruce Fields wrote:
> > Looks OK to me.  As I say I'd expect i_version_seen == true to end up
> > being the common case in a lot of v4 workloads, so I'm more skeptical of
> > the claim of a performance improvement in the v4 case.
> 
> Well, so long as we require i_version to be committed to disk on every
> single disk write, we're going to be trading off:
> 
>   * client-side performance of the advanced NFSv4 cacheing for reads
>   * server-side performance for writes
>   * data robustness in case of the server crashing and the client-side 
> cache
> getting out of sync with the server after the crash
> 
> I don't see any way around that.  (So for example, with lazy mtime
> updates we wouldn't be updating the inode after every single
> non-allocating write; enabling i_version updates will trash that
> optimization.)
> 
> I just want to reduce to a bare minimum the performance hit in the
> case where NFSv4 exports are not being used (since that is true in a
> very *large* number of ext4 deployments --- i.e., every single Android
> handset using ext4 :-), such that we can leave i_version updates
> turned on by default.

Definitely understood.  I think it's a good idea.

> > Could maintaining the new flag be a significant drag in itself?  If not,
> > then I guess we're not making things any worse there, so fine.
> 
> I don't think so; it's a bit in the in-memory inode, so I don't think
> that should be an issue.

OK!

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix fsync after truncate when no_holes feature is enabled

2015-06-26 Thread Liu Bo
On Fri, Jun 26, 2015 at 09:48:05AM +0100, Filipe Manana wrote:
> On Fri, Jun 26, 2015 at 4:21 AM, Liu Bo  wrote:
> > On Thu, Jun 25, 2015 at 03:23:59PM +0100, Filipe Manana wrote:
> >> On Thu, Jun 25, 2015 at 3:20 PM, Liu Bo  wrote:
> >> > On Thu, Jun 25, 2015 at 04:17:46AM +0100, fdman...@kernel.org wrote:
> >> >> From: Filipe Manana 
> >> >>
> >> >> When we have the no_holes feature enabled, if a we truncate a file to a
> >> >> smaller size, truncate it again but to a size greater than or equals to
> >> >> its original size and fsync it, the log tree will not have any 
> >> >> information
> >> >> about the hole covering the range [truncate_1_offset, new_file_size[.
> >> >> Which means if the fsync log is replayed, the file will remain with the
> >> >> state it had before both truncate operations.
> >> >
> >> > Does the fs/subvol tree get updated to the right information at this
> >> > time?
> >>
> >> No, and that's the problem. Because no file extent items are stored in
> >> the log tree.
> >> The inode item is updated with the new i_size however (as expected).
> >
> > Yeap, that's right and the patch looks right.
> >
> > I do appreciate your great work on fixing these corner cases,
> > but as of my understanding, they really can be taken by a force commit
> > transaction, do they deserve these complex stuff?
> 
> All the "complexity" is not about avoiding a transaction commit and do
> something more efficient instead, but rather about detecting such
> case. The simpler way of for example always force a commit if the file
> has the full sync flag (or a new flag set on truncate) and was created
> in a past generation, would be quite ironic given all the
> optimizations effort put into fsync, no-holes feature, etc.

I agree.

> 
> >
> > After all, like punch_hole, remove xattr, they're rare cases.
> 
> Maybe not as much as one would think.
> Having worked on databases before, hole punching, truncations and
> other "rare" cases aren't that rare and correct fsync behaviour is a
> must or the very least desirable.

That makes sense.

Forgot to give

Reviewed-by: Liu Bo 

Thanks,

-liubo

> 
> thanks
> 
> >
> > Thanks,
> >
> > -liubo
> >
> >>
> >> thanks
> >>
> >> >
> >> > Thanks,
> >> >
> >> > -liubo
> >> >>
> >> >> Without the no_holes feature this does not happen, since when the inode
> >> >> is logged (full sync flag is set) it will find in the fs/subvol tree a
> >> >> leaf with a generation matching the current transaction id that has an
> >> >> explicit extent item representing the hole.
> >> >>
> >> >> Fix this by adding an explicit extent item representing a hole between
> >> >> the last extent and the inode's i_size if we are doing a full sync.
> >> >>
> >> >> The issue is easy to reproduce with the following test case for fstests:
> >> >>
> >> >>   . ./common/rc
> >> >>   . ./common/filter
> >> >>   . ./common/dmflakey
> >> >>
> >> >>   _need_to_be_root
> >> >>   _supported_fs generic
> >> >>   _supported_os Linux
> >> >>   _require_scratch
> >> >>   _require_dm_flakey
> >> >>
> >> >>   # This test was motivated by an issue found in btrfs when the btrfs
> >> >>   # no-holes feature is enabled (introduced in kernel 3.14). So enable
> >> >>   # the feature if the fs being tested is btrfs.
> >> >>   if [ $FSTYP == "btrfs" ]; then
> >> >>   _require_btrfs_fs_feature "no_holes"
> >> >>   _require_btrfs_mkfs_feature "no-holes"
> >> >>   MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes"
> >> >>   fi
> >> >>
> >> >>   rm -f $seqres.full
> >> >>
> >> >>   _scratch_mkfs >>$seqres.full 2>&1
> >> >>   _init_flakey
> >> >>   _mount_flakey
> >> >>
> >> >>   # Create our test files and make sure everything is durably persisted.
> >> >>   $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \
> >> >>   -c "pwrite -S 0xbb 64K 61K"   \
> >> >>   $SCRATCH_MNT/foo | _filter_xfs_io
> >> >>   $XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \
> >> >>   -c "pwrite -S 0xff 64K 61K"   \
> >> >>   $SCRATCH_MNT/bar | _filter_xfs_io
> >> >>   sync
> >> >>
> >> >>   # Now truncate our file foo to a smaller size (64Kb) and then truncate
> >> >>   # it to the size it had before the shrinking truncate (125Kb). Then
> >> >>   # fsync our file. If a power failure happens after the fsync, we 
> >> >> expect
> >> >>   # our file to have a size of 125Kb, with the first 64Kb of data having
> >> >>   # the value 0xaa and the second 61Kb of data having the value 0x00.
> >> >>   $XFS_IO_PROG -c "truncate 64K" \
> >> >>-c "truncate 125K" \
> >> >>-c "fsync" \
> >> >>$SCRATCH_MNT/foo
> >> >>
> >> >>   # Do something similar to our file bar, but the first truncation sets
> >> >>   # the file size to 0 and the second truncation expands the size to the
> >> >>   # double of what it was initially.
> >> >>   $XFS_IO_PROG -c "truncate 0" \
> >> >>-c "truncate 253K" \
> >> >>-c "fsync" \
> >> >>

Re: [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write.

2015-06-26 Thread Liu Bo
On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote:
> For the subpagesize-blocksize scenario, a page can contain multiple
> blocks. In such cases, this patch handles writing data to files.
> 
> Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit on
> the extent_io_tree since uptodate status is being tracked by the bitmap
> pointed to by page->private.

To be honestly, I'm not sure why we set EXTENT_UPTODATE bit for data as we
don't check for that bit at all for now, correct me if I'm wrong.

> 
> Signed-off-by: Chandan Rajendra 
> ---
>  fs/btrfs/extent_io.c | 141 
> +++
>  fs/btrfs/file.c  |  16 ++
>  fs/btrfs/inode.c |  58 -
>  3 files changed, 125 insertions(+), 90 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d37badb..3736ab5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 
> start, u64 end,
>  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
>   struct extent_state **cached_state, gfp_t mask)
>  {
> - return set_extent_bit(tree, start, end,
> -   EXTENT_DELALLOC | EXTENT_UPTODATE,
> -   NULL, cached_state, mask);
> + return set_extent_bit(tree, start, end, EXTENT_DELALLOC,
> + NULL, cached_state, mask);
>  }
>  
>  int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
> @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, 
> u64 start, u64 end)
>   return 0;
>  }
>  
> -/*
> - * helper function to set both pages and extents in the tree writeback
> - */
> -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 
> end)
> -{
> - unsigned long index = start >> PAGE_CACHE_SHIFT;
> - unsigned long end_index = end >> PAGE_CACHE_SHIFT;
> - struct page *page;
> -
> - while (index <= end_index) {
> - page = find_get_page(tree->mapping, index);
> - BUG_ON(!page); /* Pages should be in the extent_io_tree */
> - set_page_writeback(page);
> - page_cache_release(page);
> - index++;
> - }
> - return 0;
> -}
> -
>  /* find the first state struct with 'bits' set after 'start', and
>   * return it.  tree->lock must be held.  NULL will returned if
>   * nothing was found after 'start'
> @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page)
>   return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
>  }
>  
> +static int page_write_complete(struct page *page)
> +{
> + u64 start = page_offset(page);
> + u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> + return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> +}
> +
>  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
>  {
>   int ret;
> @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, 
> u64 start, u64 end)
>   */
>  static void end_bio_extent_writepage(struct bio *bio, int err)
>  {
> + struct btrfs_page_private *pg_private;
>   struct bio_vec *bvec;
> + unsigned long flags;
>   u64 start;
>   u64 end;
> + int clear_writeback;
>   int i;
>  
>   bio_for_each_segment_all(bvec, bio, i) {
>   struct page *page = bvec->bv_page;
>  
> - /* We always issue full-page reads, but if some block
> -  * in a page fails to read, blk_update_request() will
> -  * advance bv_offset and adjust bv_len to compensate.
> -  * Print a warning for nonzero offsets, and an error
> -  * if they don't add up to a full page.  */
> - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> - 
> btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> -"partial page write in btrfs with offset %u 
> and length %u",
> - bvec->bv_offset, bvec->bv_len);
> - else
> - 
> btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> -"incomplete page write in btrfs with offset 
> %u and "
> -"length %u",
> - bvec->bv_offset, bvec->bv_len);
> - }
> + start = page_offset(page) + bvec->bv_offset;
> + end = start + bvec->bv_len - 1;
>  
> - start = page_offset(page);
> - end = start + bvec->bv_offset + bvec->bv_len - 1;
> + pg_private = (struct btrfs_page_private *)page->private;
> +
> + spin_lock_irqsave(&pg_private->io_lock, flags);
>  
> - if (end_extent_writepage(page, err, s

Re: [PATCH] Btrfs: fix fsync after truncate when no_holes feature is enabled

2015-06-26 Thread Filipe Manana
On Fri, Jun 26, 2015 at 4:21 AM, Liu Bo  wrote:
> On Thu, Jun 25, 2015 at 03:23:59PM +0100, Filipe Manana wrote:
>> On Thu, Jun 25, 2015 at 3:20 PM, Liu Bo  wrote:
>> > On Thu, Jun 25, 2015 at 04:17:46AM +0100, fdman...@kernel.org wrote:
>> >> From: Filipe Manana 
>> >>
>> >> When we have the no_holes feature enabled, if a we truncate a file to a
>> >> smaller size, truncate it again but to a size greater than or equals to
>> >> its original size and fsync it, the log tree will not have any information
>> >> about the hole covering the range [truncate_1_offset, new_file_size[.
>> >> Which means if the fsync log is replayed, the file will remain with the
>> >> state it had before both truncate operations.
>> >
>> > Does the fs/subvol tree get updated to the right information at this
>> > time?
>>
>> No, and that's the problem. Because no file extent items are stored in
>> the log tree.
>> The inode item is updated with the new i_size however (as expected).
>
> Yeap, that's right and the patch looks right.
>
> I do appreciate your great work on fixing these corner cases,
> but as of my understanding, they really can be taken by a force commit
> transaction, do they deserve these complex stuff?

All the "complexity" is not about avoiding a transaction commit and do
something more efficient instead, but rather about detecting such
case. The simpler way of for example always force a commit if the file
has the full sync flag (or a new flag set on truncate) and was created
in a past generation, would be quite ironic given all the
optimizations effort put into fsync, no-holes feature, etc.

>
> After all, like punch_hole, remove xattr, they're rare cases.

Maybe not as much as one would think.
Having worked on databases before, hole punching, truncations and
other "rare" cases aren't that rare and correct fsync behaviour is a
must or the very least desirable.

thanks

>
> Thanks,
>
> -liubo
>
>>
>> thanks
>>
>> >
>> > Thanks,
>> >
>> > -liubo
>> >>
>> >> Without the no_holes feature this does not happen, since when the inode
>> >> is logged (full sync flag is set) it will find in the fs/subvol tree a
>> >> leaf with a generation matching the current transaction id that has an
>> >> explicit extent item representing the hole.
>> >>
>> >> Fix this by adding an explicit extent item representing a hole between
>> >> the last extent and the inode's i_size if we are doing a full sync.
>> >>
>> >> The issue is easy to reproduce with the following test case for fstests:
>> >>
>> >>   . ./common/rc
>> >>   . ./common/filter
>> >>   . ./common/dmflakey
>> >>
>> >>   _need_to_be_root
>> >>   _supported_fs generic
>> >>   _supported_os Linux
>> >>   _require_scratch
>> >>   _require_dm_flakey
>> >>
>> >>   # This test was motivated by an issue found in btrfs when the btrfs
>> >>   # no-holes feature is enabled (introduced in kernel 3.14). So enable
>> >>   # the feature if the fs being tested is btrfs.
>> >>   if [ $FSTYP == "btrfs" ]; then
>> >>   _require_btrfs_fs_feature "no_holes"
>> >>   _require_btrfs_mkfs_feature "no-holes"
>> >>   MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes"
>> >>   fi
>> >>
>> >>   rm -f $seqres.full
>> >>
>> >>   _scratch_mkfs >>$seqres.full 2>&1
>> >>   _init_flakey
>> >>   _mount_flakey
>> >>
>> >>   # Create our test files and make sure everything is durably persisted.
>> >>   $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \
>> >>   -c "pwrite -S 0xbb 64K 61K"   \
>> >>   $SCRATCH_MNT/foo | _filter_xfs_io
>> >>   $XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \
>> >>   -c "pwrite -S 0xff 64K 61K"   \
>> >>   $SCRATCH_MNT/bar | _filter_xfs_io
>> >>   sync
>> >>
>> >>   # Now truncate our file foo to a smaller size (64Kb) and then truncate
>> >>   # it to the size it had before the shrinking truncate (125Kb). Then
>> >>   # fsync our file. If a power failure happens after the fsync, we expect
>> >>   # our file to have a size of 125Kb, with the first 64Kb of data having
>> >>   # the value 0xaa and the second 61Kb of data having the value 0x00.
>> >>   $XFS_IO_PROG -c "truncate 64K" \
>> >>-c "truncate 125K" \
>> >>-c "fsync" \
>> >>$SCRATCH_MNT/foo
>> >>
>> >>   # Do something similar to our file bar, but the first truncation sets
>> >>   # the file size to 0 and the second truncation expands the size to the
>> >>   # double of what it was initially.
>> >>   $XFS_IO_PROG -c "truncate 0" \
>> >>-c "truncate 253K" \
>> >>-c "fsync" \
>> >>$SCRATCH_MNT/bar
>> >>
>> >>   _load_flakey_table $FLAKEY_DROP_WRITES
>> >>   _unmount_flakey
>> >>
>> >>   # Allow writes again, mount to trigger log replay and validate file
>> >>   # contents.
>> >>   _load_flakey_table $FLAKEY_ALLOW_WRITES
>> >>   _mount_flakey
>> >>
>> >>   # We expect foo to have a size of 125Kb, the first 64Kb of data all
>> >>   # having the value 0xaa and the r

[PATCH] btrfs-progs: inspect: Fix out of bounds string termination.

2015-06-26 Thread Patrik Lundquist
Signed-off-by: Patrik Lundquist 
---
 cmds-inspect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmds-inspect.c b/cmds-inspect.c
index 053cf8e..aafe37d 100644
--- a/cmds-inspect.c
+++ b/cmds-inspect.c
@@ -293,7 +293,7 @@ static int cmd_subvolid_resolve(int argc, char **argv)
goto out;
}
 
-   path[PATH_MAX] = '\0';
+   path[PATH_MAX-1] = '\0';
printf("%s\n", path);
 
 out:
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] fstests: generic test for fsync after file truncations

2015-06-26 Thread fdmanana
From: Filipe Manana 

Test that if we truncate a file to a smaller size, then truncate it to
its original size or a larger size, then fsyncing it and a power failure
happens, the file will have the range [first_truncate_size, last_size[
with all bytes having a value of 0x00 if we read it the next time the
filesystem is mounted.

This test is motivated by a bug found in btrfs, which is fixed by a patch
titled: "Btrfs: fix fsync after truncate when no_holes feature is enabled"

Tested against ext3/4, xfs, btrfs (with and without the fix, and with the
no_holes feature disabled), f2fs, reiserfs and nilfs2.

All filesystems pass the test except for unpatched btrfs with the
no_holes feature enabled (as expected) and f2fs. Both produce the
following file contents that differ from the golden output:

  File foo content after log replay:
  000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
  *
  020 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
  *
  0372000
  File bar content after log replay:
  000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
  *
  020 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
  *
  0372000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  *
  0772000

Signed-off-by: Filipe Manana 
Reviewed-by: Eryu Guan 
---

V2: Added missing call to _require_metadata_journaling and Eryu's
reviewed-by tag.

 tests/generic/095 | 117 ++
 tests/generic/095.out |  19 
 tests/generic/group   |   1 +
 3 files changed, 137 insertions(+)
 create mode 100755 tests/generic/095
 create mode 100644 tests/generic/095.out

diff --git a/tests/generic/095 b/tests/generic/095
new file mode 100755
index 000..95f8b44
--- /dev/null
+++ b/tests/generic/095
@@ -0,0 +1,117 @@
+#! /bin/bash
+# FSQA Test No. 095
+#
+# Test that if we truncate a file to a smaller size, then truncate it to its
+# original size or a larger size, then fsyncing it and a power failure happens,
+# the file will have the range [first_truncate_size, last_size[ with all bytes
+# having a value of 0x00 if we read it the next time the filesystem is mounted.
+#
+# This test is motivated by a bug found in btrfs.
+#
+#---
+#
+# Copyright (C) 2015 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_need_to_be_root
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_flakey
+_require_metadata_journaling $SCRATCH_DEV
+
+# This test was motivated by an issue found in btrfs when the btrfs no-holes
+# feature is enabled (introduced in kernel 3.14). So enable the feature if the
+# fs being tested is btrfs.
+if [ $FSTYP == "btrfs" ]; then
+   _require_btrfs_fs_feature "no_holes"
+   _require_btrfs_mkfs_feature "no-holes"
+   MKFS_OPTIONS="$MKFS_OPTIONS -O no-holes"
+fi
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_init_flakey
+_mount_flakey
+
+# Create our test files and make sure everything is durably persisted.
+$XFS_IO_PROG -f -c "pwrite -S 0xaa 0 64K" \
+   -c "pwrite -S 0xbb 64K 61K"   \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+$XFS_IO_PROG -f -c "pwrite -S 0xee 0 64K" \
+   -c "pwrite -S 0xff 64K 61K"   \
+   $SCRATCH_MNT/bar | _filter_xfs_io
+sync
+
+# Now truncate our file foo to a smaller size (64Kb) and then truncate it to 
the
+# size it had before the shrinking truncate (125Kb). Then fsync our file. If a
+# power failure happens after the fsync, we expect our file to have a size of
+# 125Kb, with the first 64Kb of data having the value 0xaa and the second 61Kb
+# of data having the value 0x00.
+$XFS_IO_PROG -c "truncate 64K" \
+   -c "truncate 125K" \
+   -c "fsync" \
+   $SCRATCH_MNT/foo
+
+# Do something similar to our file bar, but the first truncati