date:20120704

[PATCH] Btrfs: Add code to support file creation time.

2012-07-04 Thread chandan r

This patch adds a new member to the 'struct btrfs_inode' structure to hold
the file creation time.

Signed-off-by: chandan 
---
 fs/btrfs/btrfs_inode.h   |  3 +++
 fs/btrfs/ctree.h |  8 
 fs/btrfs/delayed-inode.c | 10 +-
 fs/btrfs/inode.c | 25 ++---
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 12394a9..b761456 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -162,6 +162,9 @@ struct btrfs_inode {
 
struct btrfs_delayed_node *delayed_node;
 
+   /* File creation time. */
+   struct timespec i_otime;
+
struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b..4ce172f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1816,6 +1816,14 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
return (struct btrfs_timespec *)ptr;
 }
 
+static inline struct btrfs_timespec *
+btrfs_inode_otime(struct btrfs_inode_item *inode_item)
+{
+   unsigned long ptr = (unsigned long)inode_item;
+   ptr += offsetof(struct btrfs_inode_item, otime);
+   return (struct btrfs_timespec *)ptr;
+}
+
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 21d91a8..63726967 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1742,6 +1742,11 @@ static void fill_stack_inode_item(struct 
btrfs_trans_handle *trans,
 inode->i_ctime.tv_sec);
btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
  inode->i_ctime.tv_nsec);
+
+   btrfs_set_stack_timespec_sec(btrfs_inode_otime(inode_item),
+   BTRFS_I(inode)->i_otime.tv_sec);
+   btrfs_set_stack_timespec_nsec(btrfs_inode_otime(inode_item),
+   BTRFS_I(inode)->i_otime.tv_nsec);
 }
 
 int btrfs_fill_inode(struct inode *inode, u32 *rdev)
@@ -1787,6 +1792,10 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
 
+   tspec = btrfs_inode_otime(inode_item);
+   BTRFS_I(inode)->i_otime.tv_sec = btrfs_stack_timespec_sec(tspec);
+   BTRFS_I(inode)->i_otime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
 
@@ -1912,4 +1921,3 @@ void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
btrfs_release_delayed_node(prev_node);
}
 }
-
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0d507e6..145a2ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2581,6 +2581,10 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 
+   tspec = btrfs_inode_otime(inode_item);
+   BTRFS_I(inode)->i_otime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+   BTRFS_I(inode)->i_otime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
inode->i_version = btrfs_inode_sequence(leaf, inode_item);
@@ -2665,6 +2669,11 @@ static void fill_inode_item(struct btrfs_trans_handle 
*trans,
btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
inode->i_ctime.tv_nsec);
 
+   btrfs_set_timespec_sec(leaf, btrfs_inode_otime(item),
+   BTRFS_I(inode)->i_otime.tv_sec);
+   btrfs_set_timespec_nsec(leaf, btrfs_inode_otime(item),
+   BTRFS_I(inode)->i_otime.tv_nsec);
+
btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
btrfs_set_inode_sequence(leaf, item, inode->i_version);
@@ -2846,7 +2855,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
return ret;
 }
-   
+
 
 /* helper to check if there is any shared block in the path */
 static int check_path_shared(struct btrfs_root *root,
@@ -4151,7 +4160,11 @@ static struct inode *new_simple_dir(struct super_block 
*s,
inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
-   inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+   BTRFS_I(inode)->i_otime
+   = inode->i_mtime
+   = inode->i_atime
+   = inode->i_ctime
+   = CURRENT_TIME;
 
return inode;
 }
@@ -4687,7 +4700,11 @@ st

Re: [PATCH] Btrfs: Add code to support file creation time.

2012-07-04 Thread Li Zefan

On 2012/7/4 15:18, chandan r wrote:

> This patch adds a new member to the 'struct btrfs_inode' structure to hold
> the file creation time.
> 


Well, how do users use this file creation time? There's no syscall and there's
no ioctl that exports this information. That xstat syscall hasn't been accepted,
so you can revise and repost the patch when you see it happens.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Btrfs: Add code to support file creation time.

2012-07-04 Thread Alexander Block

On Wed, Jul 4, 2012 at 9:56 AM, Li Zefan  wrote:
> On 2012/7/4 15:18, chandan r wrote:
>
>> This patch adds a new member to the 'struct btrfs_inode' structure to hold
>> the file creation time.
>>
>
>
> Well, how do users use this file creation time? There's no syscall and there's
> no ioctl that exports this information. That xstat syscall hasn't been 
> accepted,
> so you can revise and repost the patch when you see it happens.
In my opinion we should still include this patch. Currently, otime is never even
initialized, having undefined values. If it ever becomes possible to
access otime, we would at least have some inodes with valid otime fields.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: btrfs GPF in read_extent_buffer() while scrubbing with kernel 3.4.2

2012-07-04 Thread Jan Schmidt

On 04.07.2012 02:17, Sami Liedes wrote:
> On Wed, Jul 04, 2012 at 01:47:56AM +0300, Sami Liedes wrote:
>> I've seen this before: An overly long "Modules linked in:" line causes
>> a large gap in netconsole output.
> 
> I managed to capture the entire output using netconsole by modifying
> the kernel to not output the list of modules.

Okay, thanks for the output. Can you please apply the patch below and capture
especially the line printed before the "cut here" line?

Thanks!
-Jan

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9018a0..beabe99 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4519,7 +4519,14 @@ void read_extent_buffer(struct extent_buffer *eb, void 
*dstv,
size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
 
-   WARN_ON(start > eb->len);
+   if (start > eb->len) {
+   printk(KERN_ERR "btrfs: invalid parameters for 
read_extent_buffer: start (%lu) > eb->len (%lu). eb start is %llu, level %d, 
generation %llu, nritems %d. len param %lu. debug %llu/%llu/%llu/%llu\n",
+   start, eb->len, eb->start, btrfs_header_level(eb),
+   btrfs_header_generation(eb), btrfs_header_nritems(eb),
+   len,
+   eb->debug[0], eb->debug[1], eb->debug[2], eb->debug[3]);
+   WARN_ON(1);
+   }
WARN_ON(start + len > eb->start + eb->len);
 
offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b516c3b..1bbf823 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -164,6 +164,8 @@ struct extent_buffer {
wait_queue_head_t lock_wq;
struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
struct page **pages;
+
+   u64 debug[4];
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d010..d9c1146 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -168,10 +168,15 @@ static int __readahead_hook(struct btrfs_root *root, 
struct extent_buffer *eb,
struct btrfs_key key;
struct btrfs_key next_key;
 
+   eb->debug[0] = 1;
+   eb->debug[1] = i;
+   eb->debug[2] = nritems;
+   eb->debug[3] = generation;
btrfs_node_key_to_cpu(eb, &key, i);
-   if (i + 1 < nritems)
+   if (i + 1 < nritems) {
+   eb->debug[0] = 2;
btrfs_node_key_to_cpu(eb, &next_key, i + 1);
-   else
+   } else
next_key = re->top;
bytenr = btrfs_node_blockptr(eb, i);
n_gen = btrfs_node_ptr_generation(eb, i);
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 0/7] Experimental btrfs send/receive (kernel side)

2012-07-04 Thread Alexander Block

Hello all,

This patchset introduces the btrfs send ioctl, which creates a stream
of instructions that can later be replayed to reconstruct the sent
subvolumes/snapshots. Patches for btrfs-progs will follow in a separate
patchset.

Some of you may remember the previous discussions on send/receive. The
original plan was to use ustar/pax as container for the stream, which
was a good format at the beginning as we planned to store extents and
other data as if they were normal files so that btrfs receive could
unpack them correctly to the right places. The advantage was that you
could unpack it with tar and use the contents by hand to some degree.

The type of the stream however has changed to some kind of instructions
stream, as this was the easiest way to handle moves, deletes and 
overwrites correctly. If this stream was stored in ustar/pax format,
we would have no advantages compared to a custom stream format. So I
dropped the ustar/pax format in the middle of development. I may add
a new mode for the ioctl (or a new ioctl?) later that emits the plain 
diff of the parent root and the root to send, instead of instructions.
This could then be used to do something like what was planned at the
beginning. It could also have other uses too. But that's for later.

The stream now consists of millions of create/rename/link/write/clone/
chmod/... instructions which only need to be replayed. No kernel
support is required to replay the stream. The only exception is the
BTRFS_IOC_SET_RECEIVED_SUBVOL call that is performed when btrfs 
receive is done.

btrfs send/receive currently only works on read-only snapshots. There
are ideas in my head floating around to make sending of r/w subvolumes
possible too, but this is for later.

We support full and incremental sending of subvolumes/snapshots.
The ioctl expects an optional list of "clone sources" and an optional
"parent root". The clone sources tell the kernel which subvolumes
can be used to accept clones from when processing file extents. The
parent root tells the kernel which root should be used for the
incremental send. Internally, it does a tree compare between the
send root and the parent root to find the differences. If no parent
is specified, the full tree is sent. The parent root is implicitly
added to the clone sources by btrfs-progs.  The parent root is also 
used for the initial snapshot operation on the receiving side. If no 
parent was specified to btrfs-progs, it will try to find a good one in 
the list of clone sources. This will however only work for snapshots
that were created with this patchset applied (due to the uuid+times
patch). Older snapshots miss parent information and you'll need to
specify a parent by hand.

If you used reflinks or the experimental dedup (found on the list)
before, you will need working cross subvolume reflinks on the
receiving side. The send ioctl tries hard to avoid emitting cross
subvolume reflinks if that is possible, but there is no guarantee
for this. If you specify clone sources by hand, there is also a
high chance that cross subvolume clones are emitted. In general,
I tend to see cross subvolume reflinks as a requirement for btrfs
send/receive.

*WARNING* *WARNING* *WARNING* *WARNING*
btrfs send/receive is experimental. The main usage for send/receive
in the future will probably be backups. If you use it for backups,
you're taking big risks and may end up with unusable backups. Please
do not only count on btrfs send/receive backups!

If you still want to use it, make sure the backups are working and
100% correct. I for example used rsync in dry run mode to ensure
that a stream was received correctly. Simply receive the just sent
snapshot and compare it against the original. Here is the command
line that I used for it:

rsync -aAXvnc --delete /origin/subvol/ /backup-target/subvol/

The -c flag is the most important here, don't remove it just to make
rsync faster. btrfs receive restores the file times 1:1, so rsync
may consider differing files as equal when it doesn't compare by
checksum. If rsync ever prints a file or directory in its output,
you have found a bug in btrfs send/receive. Please report this.

Also, the output format of btrfs send may not be final. I'll try 
hard to not change it too much and keep compatibility, but as this is a 
very early version, I can't guarantee anything. So please, don't store
the send streams with the assumption that you can still receive them
in a year.

You've been warned...

*END OF WARNING*

Big thanks go to Arne Jansen, David Sterba and Jan Schmidt (sorted by
first name) who helped me a lot with their assistance in IRC and the 
reviews done by them. The code however still needs a lot of review
and testing, so feel welcome to do so :)

You can pick and apply the patches by hand if you want. Don't
forget to also apply the required patches mentioned below. As an
alternative, here is my git repo containing all required patches:

git://github.com/ablock84/linux-btrfs.git (branch send)

The branch is based on 3.5-rc5. I had to split t

[RFC PATCH 1/7] Btrfs: use _IOR for BTRFS_IOC_SUBVOL_GETFLAGS

2012-07-04 Thread Alexander Block

We used the wrong ioctl macro for the getflags ioctl before.
As we don't have the set/getflags ioctls in the user space ioctl.h
at the moment, it's safe to fix it now.

Reviewed-by: David Sterba 
Signed-off-by: Alexander Block 
---
 fs/btrfs/ioctl.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 497c530..e440aa6 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -339,7 +339,7 @@ struct btrfs_ioctl_get_dev_stats {
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
 #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
   struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
 #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
  struct btrfs_ioctl_scrub_args)
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 4/7] Btrfs: introduce subvol uuids and times

2012-07-04 Thread Alexander Block

This patch introduces uuids for subvolumes. Each
subvolume has its own uuid. In case it was snapshotted,
it also contains parent_uuid. In case it was received,
it also contains received_uuid.

It also introduces subvolume ctime/otime/stime/rtime. The
first two are comparable to the times found in inodes. otime
is the origin/creation time and ctime is the change time.
stime/rtime are only valid on received subvolumes.
stime is the time of the subvolume when it was
sent. rtime is the time of the subvolume when it was
received.

In addition to the times, we have a transid for each
time. They are updated at the same place as the times.

btrfs receive uses stransid and rtransid to find out
if a received subvolume changed in the meantime.

If an older kernel mounts a filesystem with the
extended fields, all fields become invalid. The next
mount with a new kernel will detect this and reset the
fields.

Signed-off-by: Alexander Block 
---
 fs/btrfs/ctree.h   |   43 ++
 fs/btrfs/disk-io.c |2 +
 fs/btrfs/inode.c   |4 ++
 fs/btrfs/ioctl.c   |   96 ++--
 fs/btrfs/ioctl.h   |   13 +++
 fs/btrfs/root-tree.c   |   92 +++---
 fs/btrfs/transaction.c |   17 +
 7 files changed, 258 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8cfde93..2bd5df8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -709,6 +709,35 @@ struct btrfs_root_item {
struct btrfs_disk_key drop_progress;
u8 drop_level;
u8 level;
+
+   /*
+* The following fields appear after subvol_uuids+subvol_times
+* were introduced.
+*/
+
+   /*
+* This generation number is used to test if the new fields are valid
+* and up to date while reading the root item. Everytime the root item
+* is written out, the "generation" field is copied into this field. If
+* anyone ever mounted the fs with an older kernel, we will have
+* mismatching generation values here and thus must invalidate the
+* new fields. See btrfs_update_root and btrfs_find_last_root for
+* details.
+* the offset of generation_v2 is also used as the start for the memset
+* when invalidating the fields.
+*/
+   __le64 generation_v2;
+   u8 uuid[BTRFS_UUID_SIZE];
+   u8 parent_uuid[BTRFS_UUID_SIZE];
+   u8 received_uuid[BTRFS_UUID_SIZE];
+   __le64 ctransid; /* updated when an inode changes */
+   __le64 otransid; /* trans when created */
+   __le64 stransid; /* trans when sent. non-zero for received subvol */
+   __le64 rtransid; /* trans when received. non-zero for received subvol */
+   struct btrfs_timespec ctime;
+   struct btrfs_timespec otime;
+   struct btrfs_timespec stime;
+   struct btrfs_timespec rtime;
 } __attribute__ ((__packed__));
 
 /*
@@ -1416,6 +1445,8 @@ struct btrfs_root {
dev_t anon_dev;
 
int force_cow;
+
+   spinlock_t root_times_lock;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -2189,6 +2220,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct 
btrfs_root_item, bytes_used, 64);
 BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+rtransid, 64);
 
 static inline bool btrfs_root_readonly(struct btrfs_root *root)
 {
@@ -2829,6 +2870,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 struct extent_buffer *node);
 void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+struct btrfs_root *root);
 
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7b845ff..d3b49ad 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1182,6 +1182,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
+
+   spin_lock_init(&root->root_times_lock);
 }
 
 static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/inode.c b/fs/

[RFC PATCH 3/7] Btrfs: make iref_to_path non static

2012-07-04 Thread Alexander Block

Make iref_to_path non static (needed in send) and rename
it to btrfs_iref_to_path

Signed-off-by: Alexander Block 
---
 fs/btrfs/backref.c |   10 +-
 fs/btrfs/backref.h |4 
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7301cdb..f642d28 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1122,10 +1122,10 @@ static int inode_ref_info(u64 inum, u64 ioff, struct 
btrfs_root *fs_root,
  * required for the path to fit into the buffer. in that case, the returned
  * value will be smaller than dest. callers must check this!
  */
-static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-   struct btrfs_inode_ref *iref,
-   struct extent_buffer *eb_in, u64 parent,
-   char *dest, u32 size)
+char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+struct btrfs_inode_ref *iref,
+struct extent_buffer *eb_in, u64 parent,
+char *dest, u32 size)
 {
u32 len;
int slot;
@@ -1540,7 +1540,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref 
*iref,
ipath->fspath->bytes_left - s_ptr : 0;
 
fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-   fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
+   fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
inum, fspath_min, bytes_left);
if (IS_ERR(fspath))
return PTR_ERR(fspath);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac..1a76579 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -21,6 +21,7 @@
 
 #include "ioctl.h"
 #include "ulist.h"
+#include "extent_io.h"
 
 #define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
 
@@ -60,6 +61,9 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 delayed_ref_seq, u64 time_seq,
struct ulist **roots);
+char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+struct btrfs_inode_ref *iref, struct extent_buffer *eb,
+u64 parent, char *dest, u32 size);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 2/7] Btrfs: add helper for tree enumeration

2012-07-04 Thread Alexander Block

From: Arne Jansen 

Often no exact match is wanted but just the next lower or
higher item. There's a lot of duplicated code throughout
btrfs to deal with the corner cases. This patch adds a
helper function that can facilitate searching.

Signed-off-by: Arne Jansen 
---
 fs/btrfs/ctree.c |   74 ++
 fs/btrfs/ctree.h |3 +++
 2 files changed, 77 insertions(+)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 15cbc2b..33c8a03 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2724,6 +2724,80 @@ done:
 }
 
 /*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is 
found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+  struct btrfs_key *key, struct btrfs_path *p,
+  int find_higher, int return_any)
+{
+   int ret;
+   struct extent_buffer *leaf;
+
+again:
+   ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+   if (ret <= 0)
+   return ret;
+   /*
+* a return value of 1 means the path is at the position where the
+* item should be inserted. Normally this is the next bigger item,
+* but in case the previous item is the last in a leaf, path points
+* to the first free slot in the previous leaf, i.e. at an invalid
+* item.
+*/
+   leaf = p->nodes[0];
+
+   if (find_higher) {
+   if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+   ret = btrfs_next_leaf(root, p);
+   if (ret <= 0)
+   return ret;
+   if (!return_any)
+   return 1;
+   /*
+* no higher item found, return the next
+* lower instead
+*/
+   return_any = 0;
+   find_higher = 0;
+   btrfs_release_path(p);
+   goto again;
+   }
+   } else {
+   if (p->slots[0] == 0) {
+   ret = btrfs_prev_leaf(root, p);
+   if (ret < 0)
+   return ret;
+   if (!ret) {
+   p->slots[0] = btrfs_header_nritems(leaf) - 1;
+   return 0;
+   }
+   if (!return_any)
+   return 1;
+   /*
+* no lower item found, return the next
+* higher instead
+*/
+   return_any = 0;
+   find_higher = 1;
+   btrfs_release_path(p);
+   goto again;
+   } else {
+   --p->slots[0];
+   }
+   }
+   return 0;
+}
+
+/*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
  * This is used after shifting pointers to the left, so it stops
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b..8cfde93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2711,6 +2711,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, 
struct btrfs_root
  ins_len, int cow);
 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
  struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+  struct btrfs_key *key, struct btrfs_path *p,
+  int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
   struct btrfs_root *root, struct extent_buffer *parent,
   int start_slot, int cache_only, u64 *last_ret,
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 5/7] Btrfs: add btrfs_compare_trees function

2012-07-04 Thread Alexander Block

This function is used to find the differences between
two trees. The tree compare skips whole subtrees if it
detects shared tree blocks and thus is pretty fast.

Signed-off-by: Alexander Block 
---
 fs/btrfs/ctree.c |  425 ++
 fs/btrfs/ctree.h |   15 ++
 2 files changed, 440 insertions(+)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 33c8a03..d1c7efd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5007,6 +5007,431 @@ out:
return ret;
 }
 
+static void tree_move_down(struct btrfs_root *root,
+  struct btrfs_path *path,
+  int *level, int root_level)
+{
+   path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+   path->slots[*level]);
+   path->slots[*level - 1] = 0;
+   (*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+   struct btrfs_path *path,
+   int *level, int root_level)
+{
+   int ret = 0;
+   int nritems;
+   nritems = btrfs_header_nritems(path->nodes[*level]);
+
+   path->slots[*level]++;
+
+   while (path->slots[*level] == nritems) {
+   if (*level == root_level)
+   return -1;
+
+   /* move upnext */
+   path->slots[*level] = 0;
+   free_extent_buffer(path->nodes[*level]);
+   path->nodes[*level] = NULL;
+   (*level)++;
+   path->slots[*level]++;
+
+   nritems = btrfs_header_nritems(path->nodes[*level]);
+   ret = 1;
+   }
+   return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_root *root,
+   struct btrfs_path *path,
+   int *level, int root_level,
+   int allow_down,
+   struct btrfs_key *key)
+{
+   int ret;
+
+   if (*level == 0 || !allow_down) {
+   ret = tree_move_next_or_upnext(root, path, level, root_level);
+   } else {
+   tree_move_down(root, path, level, root_level);
+   ret = 0;
+   }
+   if (ret >= 0) {
+   if (*level == 0)
+   btrfs_item_key_to_cpu(path->nodes[*level], key,
+   path->slots[*level]);
+   else
+   btrfs_node_key_to_cpu(path->nodes[*level], key,
+   path->slots[*level]);
+   }
+   return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+struct btrfs_path *left_path,
+struct btrfs_path *right_path,
+char *tmp_buf)
+{
+   int cmp;
+   int len1, len2;
+   unsigned long off1, off2;
+
+   len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+   len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+   if (len1 != len2)
+   return 1;
+
+   off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+   off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+   right_path->slots[0]);
+
+   read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+   cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+   if (cmp)
+   return 1;
+   return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+   struct btrfs_root *right_root,
+   btrfs_changed_cb_t changed_cb, void *ctx)
+{
+   int ret;
+   int cmp;
+   struct btrfs_trans_handle *trans = NULL;
+   struct btrfs_path *left_path = NULL;
+   struct btrfs_path *right_path = NULL;
+   struct btrfs_key left_key;
+   struct btrfs_key right_key;
+   char *tmp_buf = NULL;
+   int left_root_level;
+   int right_root_level;
+   int left_level;
+   int right_level;
+   int left_end_reached;
+   int right_end_reached;
+   int advance_left;
+   int advance_right;
+   u64 left_block

[RFC PATCH 6/7] Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive (part 1)

2012-07-04 Thread Alexander Block

This patch introduces the BTRFS_IOC_SEND ioctl that is
required for send. It allows btrfs-progs to implement
full and incremental sends. Patches for btrfs-progs will
follow.

I had to split the patch as it got larger than 100k which is
the limit for the mailing list. The first part only contains
the send.h header and the helper functions for TLV handling
and long path name handling and some other helpers. The second
part contains the actual send logic from send.c

Signed-off-by: Alexander Block 
---
 fs/btrfs/Makefile |2 +-
 fs/btrfs/ioctl.h  |   10 +
 fs/btrfs/send.c   | 1009 +
 fs/btrfs/send.h   |  126 +++
 4 files changed, 1146 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/send.c
 create mode 100644 fs/btrfs/send.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2b..f740644 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-  reada.o backref.o ulist.o
+  reada.o backref.o ulist.o send.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c9e3fac..282bc64 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -304,6 +304,15 @@ struct btrfs_ioctl_received_subvol_args {
__u64   reserved[16];
 };
 
+struct btrfs_ioctl_send_args {
+   __s64 send_fd;  /* in */
+   __u64 clone_sources_count;  /* in */
+   __u64 __user *clone_sources;/* in */
+   __u64 parent_root;  /* in */
+   __u64 flags;/* in */
+   __u64 reserved[4];  /* in */
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -371,6 +380,7 @@ struct btrfs_ioctl_received_subvol_args {
 
 #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
struct btrfs_ioctl_received_subvol_args)
+#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct 
btrfs_ioctl_send_args)
 
 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
  struct btrfs_ioctl_get_dev_stats)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 000..47a2557
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (C) 2012 Alexander Block.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "send.h"
+#include "backref.h"
+#include "locking.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+
+static int g_verbose = 0;
+
+#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__)
+
+/*
+ * A fs_path is a helper to dynamically build path names with unknown size.
+ * It reallocates the internal buffer on demand.
+ * It allows fast adding of path elements on the right side (normal path) and
+ * fast adding to the left side (reversed path). A reversed path can also be
+ * unreversed if needed.
+ */
+struct fs_path {
+   union {
+   struct {
+   char *start;
+   char *end;
+   char *prepared;
+
+   char *buf;
+   int buf_len;
+   int reversed:1;
+   int virtual_mem:1;
+   char inline_buf[];
+   };
+   char pad[PAGE_SIZE];
+   };
+};
+#define FS_PATH_INLINE_SIZE \
+   (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+
+
+/* reused for each extent */
+struct clone_root {
+   struct btrfs_root *root;
+   u64 ino;
+   u64 offset;
+
+   u64 found_refs;
+};
+
+#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
+#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+
+struct send_ctx {
+   struct file *se

[RFC PATCH 0/6] Experimental btrfs send/receive (btrfs-progs)

2012-07-04 Thread Alexander Block

Hello all,

This is the user space side of btrfs send/receive.

You can apply them manually or use my git repo:

git://github.com/ablock84/btrfs-progs.git (branch send)

The branch is based on Hugo's integration-20120605 branch. I had to add a
temporary commit to fix a bug introduced in one of the strncpy/overflow
patches that got into btrfs-progs. This fix is not part of the btrfs
send/receive patchset, but you'll probably need it if you want to base on the
integration branch. I hope this is not required in the future when a new
integration branch comes out.

Example usage:

Multiple snapshots at once:
btrfs send /mnt/snap[123] > snap123.btrfs

Single snapshot with manual parent:
btrfs send -p /mnt/snap3 /mnt/snap4 > snap4.btrfs

Receive both streams:
btrfs receive /mnt2 < snap123.btrfs
btrfs receive /mnt2 < snap4.btrfs

(Please give suggestions for a file extension)

Please read the kernel side email as well, especially the warnings!

Alex.

Alexander Block (6):
  Btrfs-progs: add BTRFS_IOC_SUBVOL_GET/SETFLAGS to ioctl.h
  Btrfs-progs: update ioctl.h to support clone range ioctl
  Btrfs-progs: print inode transid and dir item data field in
debug-tree
  Btrfs-progs: update btrfs-progs for subvol uuid+times support
  Btrfs-progs: update ioctl.h to support btrfs send ioctl
  Btrfs-progs: add btrfs send/receive commands

 Makefile   |7 +-
 btrfs.c|2 +
 cmds-receive.c |  910 
 cmds-send.c|  677 +
 commands.h |4 +
 ctree.h|   40 ++-
 ioctl.h|   35 ++-
 print-tree.c   |   88 --
 send-stream.c  |  480 ++
 send-stream.h  |   58 
 send-utils.c   |  337 +
 send-utils.h   |   69 +
 send.h |  132 
 13 files changed, 2815 insertions(+), 24 deletions(-)
 create mode 100644 cmds-receive.c
 create mode 100644 cmds-send.c
 create mode 100644 send-stream.c
 create mode 100644 send-stream.h
 create mode 100644 send-utils.c
 create mode 100644 send-utils.h
 create mode 100644 send.h

-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 1/6] Btrfs-progs: add BTRFS_IOC_SUBVOL_GET/SETFLAGS to ioctl.h

2012-07-04 Thread Alexander Block

Btrfs send/receive and btrfs props need this ioctl. This patch
requires a recent kernel with the "Btrfs: use _IOR for
BTRFS_IOC_SUBVOL_GETFLAGS" patch applied.

Signed-off-by: Alexander Block 
---
 ioctl.h |2 ++
 1 file changed, 2 insertions(+)

diff --git a/ioctl.h b/ioctl.h
index f2e5d8d..6670e08 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -312,6 +312,8 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_space_args)
 #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
struct btrfs_ioctl_scrub_args)
 #define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 2/6] Btrfs-progs: update ioctl.h to support clone range ioctl

2012-07-04 Thread Alexander Block

Added missing btrfs_ioctl_clone_range_args and BTRFS_IOC_CLONE_RANGE
to ioctl.h

Signed-off-by: Alexander Block 
---
 ioctl.h |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ioctl.h b/ioctl.h
index 6670e08..023ca4c 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -281,6 +281,13 @@ struct btrfs_ioctl_logical_ino_args {
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
   struct btrfs_ioctl_vol_args)
+
+struct btrfs_ioctl_clone_range_args {
+   __s64 src_fd;
+   __u64 src_offset, src_length;
+   __u64 dest_offset;
+};
+
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
@@ -296,7 +303,8 @@ struct btrfs_ioctl_logical_ino_args {
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
   struct btrfs_ioctl_vol_args)
-/* 13 is for CLONE_RANGE */
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+  struct btrfs_ioctl_clone_range_args)
 #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 3/6] Btrfs-progs: print inode transid and dir item data field in debug-tree

2012-07-04 Thread Alexander Block

Add printing of inode transid and dir item data field.

Signed-off-by: Alexander Block 
---
 print-tree.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/print-tree.c b/print-tree.c
index fc134c0..1377732 100644
--- a/print-tree.c
+++ b/print-tree.c
@@ -48,6 +48,12 @@ static int print_dir_item(struct extent_buffer *eb, struct 
btrfs_item *item,
read_extent_buffer(eb, namebuf, (unsigned long)(di + 1), len);
printf("\t\tnamelen %u datalen %u name: %.*s\n",
   name_len, data_len, len, namebuf);
+   if (data_len) {
+   len = (data_len <= sizeof(namebuf))? data_len: 
sizeof(namebuf);
+   read_extent_buffer(eb, namebuf,
+   (unsigned long)(di + 1) + name_len, len);
+   printf("\t\tdata %.*s\n", len, namebuf);
+   }
len = sizeof(*di) + name_len + data_len;
di = (struct btrfs_dir_item *)((char *)di + len);
cur += len;
@@ -481,8 +487,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct 
extent_buffer *l)
switch (type) {
case BTRFS_INODE_ITEM_KEY:
ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-   printf("\t\tinode generation %llu size %llu block group 
%llu mode %o links %u\n",
+   printf("\t\tinode generation %llu transid %llu size 
%llu block group %llu mode %o links %u\n",
   (unsigned long long)btrfs_inode_generation(l, 
ii),
+  (unsigned long long)btrfs_inode_transid(l, ii),
   (unsigned long long)btrfs_inode_size(l, ii),
   (unsigned long 
long)btrfs_inode_block_group(l,ii),
   btrfs_inode_mode(l, ii),
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC PATCH 4/6] Btrfs-progs: update btrfs-progs for subvol uuid+times support

2012-07-04 Thread Alexander Block

Update ctree.h and ioctl.h for the new uuid+times for subvolumes.

Signed-off-by: Alexander Block 
---
 ctree.h  |   40 -
 ioctl.h  |   12 +
 print-tree.c |   79 +-
 3 files changed, 112 insertions(+), 19 deletions(-)

diff --git a/ctree.h b/ctree.h
index 254fb0b..07691c7 100644
--- a/ctree.h
+++ b/ctree.h
@@ -642,6 +642,35 @@ struct btrfs_root_item {
struct btrfs_disk_key drop_progress;
u8 drop_level;
u8 level;
+
+   /*
+* The following fields appear after subvol_uuids+subvol_times
+* were introduced.
+*/
+
+   /*
+* This generation number is used to test if the new fields are valid
+* and up to date while reading the root item. Everytime the root item
+* is written out, the "generation" field is copied into this field. If
+* anyone ever mounted the fs with an older kernel, we will have
+* mismatching generation values here and thus must invalidate the
+* new fields. See btrfs_update_root and btrfs_find_last_root for
+* details.
+* the offset of generation_v2 is also used as the start for the memset
+* when invalidating the fields.
+*/
+   __le64 generation_v2;
+   u8 uuid[BTRFS_UUID_SIZE];
+   u8 parent_uuid[BTRFS_UUID_SIZE];
+   u8 received_uuid[BTRFS_UUID_SIZE];
+   __le64 ctransid; /* updated when an inode changes */
+   __le64 otransid; /* trans when created */
+   __le64 stransid; /* trans when sent. non-zero for received subvol */
+   __le64 rtransid; /* trans when received. non-zero for received subvol */
+   struct btrfs_timespec ctime;
+   struct btrfs_timespec otime;
+   struct btrfs_timespec stime;
+   struct btrfs_timespec rtime;
 } __attribute__ ((__packed__));
 
 /*
@@ -1607,7 +1636,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct 
btrfs_root_item, bytes_used, 64);
 BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 last_snapshot, 64);
-
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+rtransid, 64);
 
 /* struct btrfs_root_backup */
 BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
diff --git a/ioctl.h b/ioctl.h
index 023ca4c..77503e6 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -20,6 +20,7 @@
 #define __IOCTL_
 #include 
 #include 
+#include 
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
@@ -272,6 +273,15 @@ struct btrfs_ioctl_logical_ino_args {
__u64   inodes;
 };
 
+struct btrfs_ioctl_received_subvol_args {
+   charuuid[BTRFS_UUID_SIZE];  /* in */
+   __u64   stransid;   /* in */
+   __u64   rtransid;   /* out */
+   struct timespec stime;  /* in */
+   struct timespec rtime;  /* out */
+   __u64   reserved[16];
+};
+
 /* BTRFS_IOC_SNAP_CREATE is no longer used by the btrfs command */
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
   struct btrfs_ioctl_vol_args)
@@ -341,4 +351,6 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
struct btrfs_ioctl_ino_path_args)
 
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+   struct btrfs_ioctl_received_subvol_args)
 #endif
diff --git a/print-tree.c b/print-tree.c
index 1377732..d9f669a 100644
--- a/print-tree.c
+++ b/print-tree.c
@@ -282,6 +282,66 @@ static void print_root_ref(struct extent_buffer *leaf, int 
slot, char *tag)
   namelen, namebuf);
 }
 
+static int count_bytes(void *buf, int len, char b)
+{
+   int cnt = 0;
+   int i;
+   for (i = 0; i < len; i++) {
+   if (((char*)buf)[i] == b)
+   cnt++;
+   }
+   return cnt;
+}
+
+static void print_root(struct extent_buffer *leaf, int slot)
+{
+   struct btrfs_root_item *ri;
+   struct btrfs_root_item root_item;
+   int len;
+   char uuid_str[128];
+
+   ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
+   len = btrfs_item_size_nr(leaf, slot);
+
+   memset(&root_item, 0, sizeof(root_item));
+   read_extent_buffer(leaf, &root_item, (unsigned long)ri, len);
+
+   printf("\t\troot data bytenr %llu level %d dirid %llu

[RFC PATCH 5/6] Btrfs-progs: update ioctl.h to support btrfs send ioctl

2012-07-04 Thread Alexander Block

Add btrfs_ioctl_send_args and BTRFS_IOC_SEND to ioctl.h

Signed-off-by: Alexander Block 
---
 ioctl.h |   11 +++
 1 file changed, 11 insertions(+)

diff --git a/ioctl.h b/ioctl.h
index 77503e6..d35710c 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -282,6 +282,15 @@ struct btrfs_ioctl_received_subvol_args {
__u64   reserved[16];
 };
 
+struct btrfs_ioctl_send_args {
+   __s64 send_fd;  /* in */
+   __u64 clone_sources_count;  /* in */
+   __u64 *clone_sources;   /* in */
+   __u64 parent_root;  /* in */
+   __u64 flags;/* in */
+   __u64 reserved[4];  /* in */
+};
+
 /* BTRFS_IOC_SNAP_CREATE is no longer used by the btrfs command */
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
   struct btrfs_ioctl_vol_args)
@@ -353,4 +362,6 @@ struct btrfs_ioctl_clone_range_args {
 
 #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
struct btrfs_ioctl_received_subvol_args)
+#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct 
btrfs_ioctl_send_args)
+
 #endif
-- 
1.7.10

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: BTRFS fsck apparent errors

2012-07-04 Thread David Sterba

On Wed, Jul 04, 2012 at 07:40:05AM +0700, Fajar A. Nugraha wrote:
> Are there any known btrfs regression in 3.4? I'm using 3.4.0-3-generic
> from a ppa, but a normal mount - umount cycle seems MUCH longer
> compared to how it was on 3.2, and iostat shows the disk is
> read-IOPS-bound

Is it just mount/umount without any other activity? Is the fs
fragmented (or aged), almost full, has lots of files?

> 
> # time mount LABEL=WD-root
> 
> real  0m10.400s
> user  0m0.000s
> sys   0m0.060s
> 
> # time umount /media/WD-root/
> 
> real  0m22.419s
> user  0m0.000s
> sys   0m0.064s
> 
> # /proc/10142/stack  <--- the PID of umount process

The process(es) actually doing the work are the btrfs workers, usual
suspects are btrfs-cache (free space cache) or btrfs-ino (inode cache)
that are writing the cache states back to disk.
I'm using iotop to observe such things.

david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: 3.4.4: BUG: Bad rss-counter state x

2012-07-04 Thread David Sterba

Hi,

On Tue, Jul 03, 2012 at 10:05:55PM -0700, Marc MERLIN wrote:
> Since I heard absolutely nothing on my last but and corruption report, I'm
> not sure if they are useful or wanted (please let me know).

They are, both useful and wanted, thanks.

> The last thing I've seen with 3.4.4 is this:
> kernel: [116130.309667] btrfs: unlinked 25 orphans
> kernel: [117951.440823] BUG: Bad rss-counter state mm:8801e2a4c080 idx:1 
> val:-1
> kernel: [117951.440832] BUG: Bad rss-counter state mm:8801e2a4c080 idx:2 
> val:1
> kernel: [119720.558186] btrfs: unlinked 12 orphans

This is not a btrfs bug, first hit for 'bug rss-counter' pointed me to

http://comments.gmane.org/gmane.linux.kernel/1316358 (kernel 3.4.2)
->
https://lkml.org/lkml/2012/6/9/47

david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: kmem_cache_destroy called for cache that still has objects

2012-07-04 Thread Jan Schmidt

Hi Martin,

On 02.07.2012 17:41, Jan Schmidt wrote:
> I'm recently seeing such buffer leaks as well. It's always about tree blocks,
> and it's always just two or three refs left for a buffer. I can somehow
> reproduce it and will be looking into it probably tomorrow. (May still be
> unrelated, of course.)

Okay, I've tracked my buffer leak down. It's most likely unrelated to yours, as
you need Chris' current for-linus for that to occur. Second, you need to trigger
a lot of tree mod log operations (i.e. by calling "btrfs inspect-internal" while
modifying the trees).

I'm sending a fix for the problem I found. However, there's no obvious
reproducer for your problem yet. Can you reproduce it?

-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/4] Btrfs: do not count in readonly bytes

2012-07-04 Thread Liu Bo

If a block group is ro, do not count its entries when we dump space info.

Signed-off-by: Liu Bo 
---
 fs/btrfs/free-space-cache.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a70c54e..2d5b42e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1967,7 +1967,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache 
*block_group,
 
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
-   if (info->bytes >= bytes)
+   if (info->bytes >= bytes && !block_group->ro)
count++;
printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
   (unsigned long long)info->offset,
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/4] Btrfs: fix a bug of writing free space cache with nodatacow option

2012-07-04 Thread Liu Bo

We can set a block group readonly when we relocate the block group.

If the block group covers the disk offset where our free space
cache inode is going to write, it will force the free space cache
inode into cow_file_range(), which is not allowed due to free space
cache design.

Signed-off-by: Liu Bo 
---
 fs/btrfs/inode.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1f72817..4892396 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1245,7 +1245,11 @@ next_slot:
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
-   if (btrfs_extent_readonly(root, disk_bytenr))
+   /*
+* nolock means that we're free space cache inode, and
+* free space cache inode cannot go to cow_file_range().
+*/
+   if (btrfs_extent_readonly(root, disk_bytenr) && !nolock)
goto out_check;
if (btrfs_cross_ref_exist(trans, root, ino,
  found_key.offset -
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/4] Btrfs: do not abort transaction in prealloc case

2012-07-04 Thread Liu Bo

During disk balance, we prealloc new file extent for file data relocation,
but we may fail in 'no available space' case, and only under this case can
the error be reported to userspace, so we do not need to abort transaction
here.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.h   |8 
 fs/btrfs/extent-tree.c |   13 +++--
 fs/btrfs/inode.c   |8 
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84ac723..821a556 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2564,10 +2564,10 @@ int btrfs_alloc_logged_file_extent(struct 
btrfs_trans_handle *trans,
   u64 root_objectid, u64 owner, u64 offset,
   struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, u64 data);
+struct btrfs_root *root,
+u64 num_bytes, u64 min_alloc_size,
+u64 empty_size, u64 hint_byte,
+struct btrfs_key *ins, u64 data, int abort_on_enospc);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5775dc4..bbe79ab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5419,7 +5419,7 @@ static noinline int find_free_extent(struct 
btrfs_trans_handle *trans,
 struct btrfs_root *orig_root,
 u64 num_bytes, u64 empty_size,
 u64 hint_byte, struct btrfs_key *ins,
-u64 data)
+u64 data, int abort_on_enospc)
 {
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -5778,8 +5778,9 @@ loop:
 2 * 1024 * 1024, data,
 CHUNK_ALLOC_LIMITED);
if (ret < 0) {
-   btrfs_abort_transaction(trans,
-   root, ret);
+   if (abort_on_enospc)
+   btrfs_abort_transaction(trans,
+   root, ret);
goto out;
}
allowed_chunk_alloc = 0;
@@ -5864,7 +5865,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 u64 num_bytes, u64 min_alloc_size,
 u64 empty_size, u64 hint_byte,
-struct btrfs_key *ins, u64 data)
+struct btrfs_key *ins, u64 data, int abort_on_enospc)
 {
bool final_tried = false;
int ret;
@@ -5887,7 +5888,7 @@ again:
 
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
-  hint_byte, ins, data);
+  hint_byte, ins, data, abort_on_enospc);
 
if (ret == -ENOSPC) {
if (!final_tried) {
@@ -6294,7 +6295,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct 
btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);
 
ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
-  empty_size, hint, &ins, 0);
+  empty_size, hint, &ins, 0, 1);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9f07bd1..1f72817 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -662,7 +662,7 @@ retry:
ret = btrfs_reserve_extent(trans, root,
   async_extent->compressed_size,
   async_extent->compressed_size,
-  0, alloc_hint, &ins, 1);
+  0, alloc_hint, &ins, 1, 1);
if (ret)
btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
@@ -888,7 +888,7 @@ static noinline int cow_file_range(struct inode *inode,
cu

[PATCH 3/4] Btrfs: add ro notification to dump_space_info

2012-07-04 Thread Liu Bo

Block group has ro attributes, make dump_space_info show it.

Signed-off-by: Liu Bo 
---
 fs/btrfs/extent-tree.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bbe79ab..f29859b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5847,12 +5847,13 @@ again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
-  "%llu pinned %llu reserved\n",
+  "%llu pinned %llu reserved %s\n",
   (unsigned long long)cache->key.objectid,
   (unsigned long long)cache->key.offset,
   (unsigned long long)btrfs_block_group_used(&cache->item),
   (unsigned long long)cache->pinned,
-  (unsigned long long)cache->reserved);
+  (unsigned long long)cache->reserved,
+  cache->ro ? "[readonly]" : "");
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs: fix buffer leak in btrfs_next_old_leaf

2012-07-04 Thread Jan Schmidt

When calling btrfs_next_old_leaf, we were leaking an extent buffer in the
rare case of using the deadlock avoidance code needed for the tree mod log.

Signed-off-by: Jan Schmidt 
---
 fs/btrfs/ctree.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

Chris: you can either put it on top of your current for-linus branch,
use it as a fixup to d42244a (part of your for-linus), or even keep it
for the next rc, as that buffer leak should be really rare.

-Jan

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b39..67fe46f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5127,6 +5127,7 @@ again:
 * locked. To solve this situation, we give up
 * on our lock and cycle.
 */
+   free_extent_buffer(next);
btrfs_release_path(path);
cond_resched();
goto again;
-- 
1.7.3.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: 3.4.4: BUG: Bad rss-counter state x

2012-07-04 Thread Marc MERLIN

On Wed, Jul 04, 2012 at 03:53:05PM +0200, David Sterba wrote:
> Hi,
> 
> On Tue, Jul 03, 2012 at 10:05:55PM -0700, Marc MERLIN wrote:
> > Since I heard absolutely nothing on my last bug and corruption report, I'm
> > not sure if they are useful or wanted (please let me know).
> 
> They are, both useful and wanted, thanks.
> 
> > The last thing I've seen with 3.4.4 is this:
> > kernel: [116130.309667] btrfs: unlinked 25 orphans
> > kernel: [117951.440823] BUG: Bad rss-counter state mm:8801e2a4c080 
> > idx:1 val:-1
> > kernel: [117951.440832] BUG: Bad rss-counter state mm:8801e2a4c080 
> > idx:2 val:1
> > kernel: [119720.558186] btrfs: unlinked 12 orphans
> 
> This is not a btrfs bug, first hit for 'bug rss-counter' pointed me to
> 
> http://comments.gmane.org/gmane.linux.kernel/1316358 (kernel 3.4.2)
> ->
> https://lkml.org/lkml/2012/6/9/47

Doh, I was not awake when I sent this yesterday, sorry for the false alert.

Thanks for the reply,
Marc
-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/  
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 0/6] Experimental btrfs send/receive (btrfs-progs)

2012-07-04 Thread Chris Mason

On Wed, Jul 04, 2012 at 07:39:28AM -0600, Alexander Block wrote:
> Hello all,
> 
> This is the user space side of btrfs send/receive.
> 
> You can apply them manually or use my git repo:
> 
> git://github.com/ablock84/btrfs-progs.git (branch send)
> 
> The branch is based on Hugo's integration-20120605 branch. I had to add a 
> temporary
> commit to fix a bug introduced in one of the strncpy/overflow patches that 
> got into
> btrfs-progs. This fix is not part of the btrfs send/receive patchset, but 
> you'll
> probably need it if you want to base on the integration branch. I hope this 
> is not
> required in the future when a new integration branch comes out.

Just awesome.  I'm playing with this now.

Except for the arm patch, I have the integration here in final testing
for 0.20.  This stuff will make 0.21 when it is all done.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Long btrfs hangs during suspend to RAM / BTRFS warning (device dm-0): Aborting unused transaction

2012-07-04 Thread Marc MERLIN

On Wed, Jul 04, 2012 at 01:58:31PM +0800, Liu Bo wrote:
> The dmesg log, sysrq log and stack dump info can usually be very helpful.
> 
> From your report, we can see the csum error and hang on log,
> 'no csum' is not that bad while hanging-on is serious and dangerous.
> 
> so can you please get any 'sysrq + w' log in the hanging-on case and paste 
> them here,
> and the log may tell us who blocks other threads.

Hi, thanks for the answer.
 
I dumped all sysrq data, that was in my original Email. Here are two
different sysrq+w runs, as well as aborted transaction messages from that
Email. 
Sorry that the original was a bit long and contained a bunch of sysrq output.

From doing further testing since then, it does seem that the code just starts
doing bad things, including the file corruption I saw, when I'm running low
on free space.

Anything else that would help?

Thanks,
Marc

> [84951.680847] Sched Debug Version: v0.10, 3.4.4-amd64-preempt-noide-20120410 
> #1
> [84951.680850] ktime   : 84951680.847904
> [84951.680851] sched_clk   : 55820767.832790
> [84951.680853] cpu_clk : 84951680.846676
> [84951.680855] jiffies : 4316130216
> [84951.680857] sched_clock_stable  : 0
> [84951.680858] 
> [84951.680859] sysctl_sched
> [84951.680860]   .sysctl_sched_latency: 12.00
> [84951.680862]   .sysctl_sched_min_granularity: 1.50
> [84951.680864]   .sysctl_sched_wakeup_granularity : 2.00
> [84951.680865]   .sysctl_sched_child_runs_first   : 0
> [84951.680867]   .sysctl_sched_features   : 24119
> [84951.680869]   .sysctl_sched_tunable_scaling: 1 (logaritmic)
> [84951.680871] 
> [84951.680872] cpu#0, 2527.110 MHz
> [84951.680873]   .nr_running: 4
> [84951.680875]   .load  : 1542
> [84951.680876]   .nr_switches   : 298015357
> [84951.680878]   .nr_load_updates   : 16858906
> [84951.680879]   .nr_uninterruptible: -7812
> [84951.680881]   .next_balance  : 4316.130229
> [84951.680883]   .curr->pid : 6859
> [84951.680884]   .clock : 84951679.104864
> [84951.680886]   .cpu_load[0]   : 1277
> [84951.680887]   .cpu_load[1]   : 885
> [84951.680889]   .cpu_load[2]   : 728
> [84951.680890]   .cpu_load[3]   : 693
> [84951.680892]   .cpu_load[4]   : 731
> [84951.680894] 
> [84951.680894] cfs_rq[0]:/autogroup-69
> [84951.680896]   .exec_clock: 0.00
> [84951.680898]   .MIN_vruntime  : 0.01
> [84951.680899]   .min_vruntime  : 194572.976125
> [84951.680901]   .max_vruntime  : 0.01
> [84951.680903]   .spread: 0.00
> [84951.680904]   .spread0   : -52160826.593685
> [84951.680906]   .nr_spread_over: 0
> [84951.680907]   .nr_running: 0
> [84951.680909]   .load  : 0
> [84951.680910]   .load_avg  : 5179.920896
> [84951.680912]   .load_period   : 8.310703
> [84951.680913]   .load_contrib  : 623
> [84951.680915]   .load_tg   : 623
> [84951.680917]   .se->exec_start: 84951675.234336
> [84951.680918]   .se->vruntime  : 52355393.270692
> [84951.680920]   .se->sum_exec_runtime  : 241715.485721
> [84951.680922]   .se->load.weight   : 2
> [84951.680923] 
> [84951.680924] cfs_rq[0]:/autogroup-20
> [84951.680925]   .exec_clock: 0.00
> [84951.680927]   .MIN_vruntime  : 0.01
> [84951.680929]   .min_vruntime  : 13027.823103
> [84951.680930]   .max_vruntime  : 0.01
> [84951.680932]   .spread: 0.00
> [84951.680934]   .spread0   : -52342371.746707
> [84951.680935]   .nr_spread_over: 0
> [84951.680937]   .nr_running: 0
> [84951.680938]   .load  : 0
> [84951.680940]   .load_avg  : 1279.999872
> [84951.680941]   .load_period   : 6.444962
> [84951.680943]   .load_contrib  : 198
> [84951.680944]   .load_tg   : 257
> [84951.680946]   .se->exec_start: 84951664.094034
> [84951.680948]   .se->vruntime  : 52355389.621584
> [84951.680950]   .se->sum_exec_runtime  : 13082.655453
> [84951.680951]   .se->load.weight   : 2
> [84951.680953] 
> [84951.680954] cfs_rq[0]:/autogroup-74
> [84951.680955]   .exec_clock: 0.00
> [84951.680957]   .MIN_vruntime  : 1323610.934982
> [8495

df shows wrong infos on btrfs raid (5gb and 3gb hdd)

2012-07-04 Thread Bernd Kohler

Hi,

this is not really a bug in btrfs but to spread the info I will just
drop this short message:

My system (a VirtualBox VM with 3 virtual HDDs of 10G, 5G and 3G) runs an
Ubuntu 12.04 LTS 64bit installed today with kernel 3.2.0-26 generic; the
/dev/sda hdd contains /boot with ext4 and / with btrfs.

After the post-installation update/upgrade process I created a new partition on
each of /dev/sdb and /dev/sdc with the maximum available size - 5G for sdb1 and
3G for sdc1.

Now I made my RAID:
mkfs.btrfs -L MYRAID -d raid1 -m raid1 /dev/sdb1 /dev/sdc1

and made this RAID available to the filesystem
mount /dev/sdc1 /mnt

The information given by "btrfs fi show  /dev/sdb1" and "btrfs fi df /mnt"
is OK, but df is "lying":

Filesystem Size Used Avail Use% Mounted on
/dev/sdc1  8.0G 5.2G 56k   100% /mnt

Don't be confused by the used space: I tried to write a 4G file to this
RAID, which ended with "write error: no space left on device" - as expected.

FYI

best

Bernd Kohler

-- 
UMIC - RWTH Aachen
http://www.umic.rwth-aachen.de

Mies-van-der-Rohe Str. 15
52074 Aachen

Tel.:   +49 241 80 20791
Fax:+49 241 80 22731
E-Mail: koh...@umic.rwth-aachen.de

The future started 6/6/12
~~
0100 1001 0101  0111 0110 0011 0110




smime.p7s
Description: S/MIME Cryptographic Signature

Re: [PATCH 3/3] Btrfs-progs: add 's' option for 'btrfs subvolume list'

2012-07-04 Thread David Sterba

On Fri, Jun 29, 2012 at 06:00:38PM +0800, Liu Bo wrote:
> We want 'btrfs subvolume list' to act as 'ls', which means that
> we can not only list out all the subvolumes we have, but also list
> each single one.
> 
> So this patch add 's' option to show a single one:
> 
> $ ./btrfs sub list /mnt/btrfs/
> ID 256 top level 5 path subvol (Readonly,)
> ID 257 top level 5 path snapshot
> ID 258 top level 5 path subvol2
> ID 259 top level 5 path subvol2/subvol3
> 
> $ ./btrfs sub list -s /mnt/btrfs/subvol
> ID 256 top level 5 path subvol (Readonly,)

suggestions and comments:

1) show the subvolume flags only if an option is given, similar to -p
   (to show parent subvol),

2) move the flags before the subvolume path -- it is of a
   variable length and it's a bit easier (but not reliable) to parse it
   from scripts

sidenote: 'find /mnt -inum 256 -print0' will list all subvolumes in a
way that's resistant to funny characters in the path, but is slow as it
has to traverse the filesystem (and 'find' does not support a true
breadth-first-search, needs to be iterated with -mindepth/maxdepth).

the 'subvol list' command could mimic the -print0 and print the
subvolume paths terminated by '\0'.

3) drop the , if there's only one subvol property

4) the '-s' option on the mountpoint does not show anything, though I
   would expect that, eg when the mountpoint is a subvolume

--

one comment to code below

david

> 
> Signed-off-by: Liu Bo 
> ---
>  btrfs-list.c |   41 -
>  cmds-subvolume.c |   15 ++-
>  2 files changed, 50 insertions(+), 6 deletions(-)
> 
> diff --git a/btrfs-list.c b/btrfs-list.c
> index f1baa52..3e79239 100644
> --- a/btrfs-list.c
> +++ b/btrfs-list.c
> @@ -312,6 +312,30 @@ static int lookup_ino_path(int fd, struct root_info *ri)
>   return 0;
>  }
>  
> +/*
> + * helper function for getting the root which the file is belonged to.
> + */
> +static int lookup_ino_rootid(int fd, u64 *rootid)
> +{
> + struct btrfs_ioctl_ino_lookup_args args;
> + int ret, e;
> +
> + memset(&args, 0, sizeof(args));
> + args.treeid = 0;
> + args.objectid = BTRFS_FIRST_FREE_OBJECTID;
> +
> + ret = ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args);
> + e = errno;
> + if (ret) {
> + fprintf(stderr, "ERROR: Failed to lookup root id - %s\n",
> + strerror(e));
> + return ret;
> + }
> +
> + *rootid = args.treeid;
> + return 0;
> +}
> +
>  /* finding the generation for a given path is a two step process.
>   * First we use the inode loookup routine to find out the root id
>   *
> @@ -704,11 +728,12 @@ int subvol_get_set_flags(int fd, int set, u64 flags, 
> u64 root_id)
>   return 0;
>  }
>  
> -int list_subvols(int fd, int print_parent, int get_default)
> +int list_subvols(int fd, int print_parent, int print_self, int get_default)
>  {
>   struct root_lookup root_lookup;
>   struct rb_node *n;
>   int ret;
> + u64 subvolid = 0;
>  
>   ret = __list_subvol_search(fd, &root_lookup);
>   if (ret) {
> @@ -725,6 +750,9 @@ int list_subvols(int fd, int print_parent, int 
> get_default)
>   if (ret < 0)
>   return ret;
>  
> + if (print_self)
> + lookup_ino_rootid(fd, &subvolid);

you should probably check the return value

> +
>   /* now that we have all the subvol-relative paths filled in,
>* we have to string the subvols together so that we can get
>* a path all the way back to the FS root
> @@ -739,6 +767,14 @@ int list_subvols(int fd, int print_parent, int 
> get_default)
>   entry = rb_entry(n, struct root_info, rb_node);
>   resolve_root(&root_lookup, entry, &root_id, &parent_id,
>   &level, &path);
> +
> + /* print this subvolume only */
> + if (print_self && subvolid != root_id) {
> + free(path);
> + n = rb_prev(n);
> + continue;
> + }
> +
>   if (print_parent) {
>   printf("ID %llu parent %llu top level %llu path %s",
>   (unsigned long long)root_id,
> @@ -753,6 +789,9 @@ int list_subvols(int fd, int print_parent, int 
> get_default)
>   printf("\n");
>   free(path);
>   n = rb_prev(n);
> +
> + if (print_self)
> + break;
>   }
>  
>   return ret;
> diff --git a/cmds-subvolume.c b/cmds-subvolume.c
> index 8783e67..f779b78 100644
> --- a/cmds-subvolume.c
> +++ b/cmds-subvolume.c
> @@ -30,7 +30,7 @@
>  #include "commands.h"
>  
>  /* btrfs-list.c */
> -int list_subvols(int fd, int print_parent, int get_default);
> +int list_subvols(int fd, int print_parent, int print_self, int get_default);
>  int find_updated_files(int fd, u64 root_id, u64 oldest_gen);
>  int subvol_get_set_flags(int fd, int set, u64 flags, u64 root_id);
>  
> @@ -21

Re: BTRFS fsck apparent errors

2012-07-04 Thread Fajar A. Nugraha

On Wed, Jul 4, 2012 at 8:42 PM, David Sterba  wrote:
> On Wed, Jul 04, 2012 at 07:40:05AM +0700, Fajar A. Nugraha wrote:
>> Are there any known btrfs regressions in 3.4? I'm using 3.4.0-3-generic
>> from a ppa, but a normal mount - umount cycle seems MUCH longer
>> compared to how it was on 3.2, and iostat shows the disk is
>> read-IOPS-bound
>
> Is it just mount/umount without any other activity?

Yes

> Is the fs
> fragmented

Not sure how to check that quickly

> (or aged),

Over 1 year, so yes

> almost full,

df says 83% used, so probably yes (depending on how you define "almost")

~ $ df -h /media/WD-root
Filesystem  Size  Used Avail Use% Mounted on
/dev/sdc2   922G  733G  155G  83% /media/WD-root

~ $ sudo btrfs fi df /media/WD-root/
Data: total=883.95GB, used=729.68GB
System, DUP: total=8.00MB, used=104.00KB
System: total=4.00MB, used=0.00
Metadata, DUP: total=18.75GB, used=1.49GB
Metadata: total=8.00MB, used=0.00

> has lots of files?

it's a "normal" 1 TB usb disk, with docs, movies, vm images, etc. No
particular lots-of-small-files like maildir or anything like that.


>> # time umount /media/WD-root/
>>
>> real  0m22.419s
>> user  0m0.000s
>> sys   0m0.064s
>>
>> # /proc/10142/stack  <--- the PID of umount process
>
> The process(es) actually doing the work are the btrfs workers, usual
> suspects are btrfs-cache (free space cache) or btrfs-ino (inode cache)
> that are writing the cache states back to disk.

Not sure about that, since iostat shows it's mostly read, not write.
Will try iotop later.
I tested also with Chris' for-linus on top of 3.4, same result (really
long time to umount).

Reverting back to ubuntu's 3.2.0-26-generic, umount only took less than 1 s :P
So I guess I'm switching back to 3.2 for now.

-- 
Fajar
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: btrfs_print_tree?

2012-07-04 Thread David Sterba

On Sun, Jul 01, 2012 at 06:16:55PM +0800, Jeff Liu wrote:
> On 07/01/2012 05:49 PM, Zhi Yong Wu wrote:
> 
> > On Sun, Jul 1, 2012 at 5:41 PM, Mike Fleetwood
> > No, i also did as this, but didn't find out who will invoke this
> > function. From above output, we only saw that it invokes itself one
> > time.
> 
> Looks like this is a helper routine exported to btrfs-progs previously, it is
> used by debug-tree, quick-test, etc...
> 
> But this function has been implemented at btrfs-progs now, maybe it
> could be safely removed from kernel, not sure. :)

Please do not remove that function, it could be helpful for debugging
prints.

david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: btrfs GPF in read_extent_buffer() while scrubbing with kernel 3.4.2

2012-07-04 Thread Sami Liedes

On Wed, Jul 04, 2012 at 01:26:46PM +0200, Jan Schmidt wrote:
> On 04.07.2012 02:17, Sami Liedes wrote:
> > On Wed, Jul 04, 2012 at 01:47:56AM +0300, Sami Liedes wrote:
> >> I've seen this before: An overly long "Modules linked in:" line causes
> >> a large gap in netconsole output.
> > 
> > I managed to capture the entire output using netconsole by modifying
> > the kernel to not output the list of modules.
> 
> Okay, thanks for the output. Can you please apply the patch below and capture
> especially the line printed before the "cut here" line?

Here you go.

Sami


[  121.524803] netpoll: netconsole: local port 
[  121.524831] netpoll: netconsole: local IP 192.168.1.2
[  121.524853] netpoll: netconsole: interface 'eth0'
[  121.524874] netpoll: netconsole: remote port 1194
[  121.524894] netpoll: netconsole: remote IP 192.168.1.73
[  121.524917] netpoll: netconsole: remote ethernet address 00:1c:10:44:47:2c
[  121.525055] console [netcon0] enabled
[  121.525074] netconsole: network logging started
[  200.980496] btrfs: invalid parameters for read_extent_buffer: start (32771) 
> eb->len (32768). eb start is 2243489562624, level 26, generation 
3144240307695375391, nritems 620178657. len param 17. debug 
2/989/620178657/3144240307695375391
[  200.980594] [ cut here ]
[  200.980644] WARNING: at fs/btrfs/extent_io.c:4528 
read_extent_buffer+0x167/0x1a0 [btrfs]()
[  200.980681] Hardware name: System Product Name
[  200.980701] Modules linked in:   [last unloaded: scsi_wait_scan]
[  200.980739] Pid: 1145, comm: btrfs-endio-met Tainted: GW
3.4.4-modded-oops+ #2
[  200.980774] Call Trace:
[  200.980792]  [] warn_slowpath_common+0x7a/0xb0
[  200.980821]  [] warn_slowpath_null+0x15/0x20
[  200.980860]  [] read_extent_buffer+0x167/0x1a0 [btrfs]
[  200.980902]  [] btrfs_node_key+0x1d/0x20 [btrfs]
[  200.980941]  [] __readahead_hook.isra.5+0x3ff/0x460 [btrfs]
[  200.980982]  [] btree_readahead_hook+0x1f/0x40 [btrfs]
[  200.981022]  [] btree_readpage_end_io_hook+0x111/0x260 
[btrfs]
[  200.981065]  [] ? find_first_extent_bit_state+0x22/0x80 
[btrfs]
[  200.981109]  [] end_bio_extent_readpage+0xcb/0xa30 [btrfs]
[  200.981150]  [] ? end_workqueue_fn+0x31/0x50 [btrfs]
[  200.981182]  [] bio_endio+0x18/0x30
[  200.981214]  [] end_workqueue_fn+0x3c/0x50 [btrfs]
[  200.981253]  [] worker_loop+0x157/0x560 [btrfs]
[  200.981291]  [] ? btrfs_queue_worker+0x310/0x310 [btrfs]
[  200.981323]  [] kthread+0x8e/0xa0
[  200.981348]  [] kernel_thread_helper+0x4/0x10
[  200.981377]  [] ? flush_kthread_worker+0x70/0x70
[  200.981406]  [] ? gs_change+0x13/0x13
[  200.981430] ---[ end trace e93713a9d40cd06e ]---
[  200.981459] general protection fault:  [#1] SMP 
[  200.981487] CPU 2 
[  200.981498] Modules linked in:   [last unloaded: scsi_wait_scan]
[  200.981540] 
[  200.981550] Pid: 1145, comm: btrfs-endio-met Tainted: GW
3.4.4-modded-oops+ #2 System manufacturer System Product Name/P8P67 EVO
[  200.981612] RIP: 0010:[]  [] 
memcpy+0xd/0x110
[  200.981650] RSP: :8801f4bf7b68  EFLAGS: 00010202
[  200.981675] RAX: 8801f4bf7c8f RBX: 0011 RCX: 0002
[  200.981707] RDX: 0001 RSI: 00050803 RDI: 8801f4bf7c8f
[  200.981738] RBP: 8801f4bf7be0 R08:  R09: 
[  200.981769] R10: 8801f4bf7c8f R11: 8801f3930780 R12: 8801f4bf7ca0
[  200.981800] R13: 8801f7286178 R14: 0048 R15: 0011
[  200.981832] FS:  () GS:88021ec8() 
knlGS:
[  200.981868] CS:  0010 DS:  ES:  CR0: 8005003b
[  200.981894] CR2: f773a000 CR3: 00020d0de000 CR4: 000407e0
[  200.981925] DR0:  DR1:  DR2: 
[  200.981956] DR3:  DR6: 0ff0 DR7: 0400
[  200.981988] Process btrfs-endio-met (pid: 1145, threadinfo 8801f4bf6000, 
task 8801f4ff42f0)
[  200.982026] Stack:
[  200.982038]  a01d6d5b 880124f72ce1 0011 
0002
[  200.982084]  03dd 24f72ce1 2ba295e6a4a5d41f 
88020dc65000
[  200.982130]  8801f4bf7c8f 1000 8801f4bf7c58 
03dd
[  200.982175] Call Trace:
[  200.982201]  [] ? read_extent_buffer+0xbb/0x1a0 [btrfs]
[  200.982243]  [] btrfs_node_key+0x1d/0x20 [btrfs]
[  200.982281]  [] __readahead_hook.isra.5+0x3ff/0x460 [btrfs]
[  200.982322]  [] btree_readahead_hook+0x1f/0x40 [btrfs]
[  200.982362]  [] btree_readpage_end_io_hook+0x111/0x260 
[btrfs]
[  200.982405]  [] ? find_first_extent_bit_state+0x22/0x80 
[btrfs]
[  200.982448]  [] end_bio_extent_readpage+0xcb/0xa30 [btrfs]
[  200.982489]  [] ? end_workqueue_fn+0x31/0x50 [btrfs]
[  200.982519]  [] bio_endio+0x18/0x30
[  200.982552]  [] end_workqueue_fn+0x3c/0x50 [btrfs]
[  200.982591]  [] worker_loop+0x157/0x560 [btrfs]
[  200

Re: btrfs GPF in read_extent_buffer() while scrubbing with kernel 3.4.2

2012-07-04 Thread Jan Schmidt

On 04.07.2012 18:03, Sami Liedes wrote:
> Here you go.
> 
>   Sami
> [...]
> [  200.980496] btrfs: invalid parameters for read_extent_buffer: start 
> (32771) > eb->len (32768). eb start is 2243489562624, level 26, generation 
> 3144240307695375391, nritems 620178657. len param 17. debug 
> 2/989/620178657/3144240307695375391

Wow, that's strange. Can you repeat your test once or twice and paste that line,
please? I'd like to get a feeling if the values are completely random.

Reading more of the readahead code now...

Thanks,
-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 5/7] Btrfs: add btrfs_compare_trees function

2012-07-04 Thread Alex Lyakas

Hi Alex,

> +   spin_lock(&left_root->root_times_lock);
> +   ctransid = btrfs_root_ctransid(&left_root->root_item);
> +   spin_unlock(&left_root->root_times_lock);
> +   if (ctransid != left_start_ctransid)
> +   left_start_ctransid = 0;
> +
> +   spin_lock(&right_root->root_times_lock);
> +   ctransid = 
> btrfs_root_ctransid(&right_root->root_item);
> +   spin_unlock(&right_root->root_times_lock);
> +   if (ctransid != right_start_ctransid)
> +   left_start_ctransid = 0;
Shouldn't it be here right_start_ctransid=0? Otherwise,
right_start_ctransid is pretty useless in this function.

> +
> +   if (!left_start_ctransid || !right_start_ctransid) {
> +   WARN(1, KERN_WARNING
> +   "btrfs: btrfs_compare_tree detected "
> +   "a change in one of the trees while "
> +   "iterating. This is probably a "
> +   "bug.\n");
> +   ret = -EIO;
> +   goto out;
> +   }

I am reading the code and have more questions (and comments), but will
send them all later.

Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 5/7] Btrfs: add btrfs_compare_trees function

2012-07-04 Thread Alex Lyakas

Hi Alex,

> +static int tree_compare_item(struct btrfs_root *left_root,
> +struct btrfs_path *left_path,
> +struct btrfs_path *right_path,
> +char *tmp_buf)
> +{
> +   int cmp;
> +   int len1, len2;
> +   unsigned long off1, off2;
> +
> +   len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
> +   len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
> +   if (len1 != len2)
> +   return 1;
> +
> +   off1 = btrfs_item_ptr_offset(left_path->nodes[0], 
> left_path->slots[0]);
> +   off2 = btrfs_item_ptr_offset(right_path->nodes[0],
> +   right_path->slots[0]);
> +
> +   read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
> +
> +   cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
> +   if (cmp)
> +   return 1;
> +   return 0;
> +}
It might be worth noting in the comment that tmp_buf should be
large enough to hold the item from the left tree. Can it happen that
the right tree has a different leafsize?

> +   /*
> +* Strategy: Go to the first items of both trees. Then do
> +*
> +* If both trees are at level 0
> +*   Compare keys of current items
> +* If left < right treat left item as new, advance left tree
> +*   and repeat
> +* If left > right treat right item as deleted, advance right tree
> +*   and repeat
> +* If left == right do deep compare of items, treat as changed if
> +*   needed, advance both trees and repeat
> +* If both trees are at the same level but not at level 0
> +*   Compare keys of current nodes/leafs
> +* If left < right advance left tree and repeat
> +* If left > right advance right tree and repeat
> +* If left == right compare blockptrs of the next nodes/leafs
> +*   If they match advance both trees but stay at the same level
> +* and repeat
> +*   If they don't match advance both trees while allowing to go
> +* deeper and repeat
> +* If tree levels are different
> +*   Advance the tree that needs it and repeat
> +*
> +* Advancing a tree means:
> +*   If we are at level 0, try to go to the next slot. If that's not
> +*   possible, go one level up and repeat. Stop when we found a level
> +*   where we could go to the next slot. We may at this point be on a
> +*   node or a leaf.
> +*
> +*   If we are not at level 0 and not on shared tree blocks, go one
> +*   level deeper.
> +*
> +*   If we are not at level 0 and on shared tree blocks, go one slot 
> to
> +*   the right if possible or go up and right.
> +*/
According to the strategy and to the code later, "left" tree is
treated as "newer one", while "right" as "older one", correct? Do you
think it would be more intuitive to make it the other way around,
although I guess this is a matter of personal taste. I had to draw the
leafs reversed to keep going:
R   L
- -
| | | | | | | |
- -


> +   if (advance_left && !left_end_reached) {
> +   ret = tree_advance(left_root, left_path, &left_level,
> +   left_root_level,
> +   advance_left != ADVANCE_ONLY_NEXT,
> +   &left_key);
> +   if (ret < 0)
> +   left_end_reached = ADVANCE;
> +   advance_left = 0;
> +   }
> +   if (advance_right && !right_end_reached) {
> +   ret = tree_advance(right_root, right_path, 
> &right_level,
> +   right_root_level,
> +   advance_right != ADVANCE_ONLY_NEXT,
> +   &right_key);
> +   if (ret < 0)
> +   right_end_reached = ADVANCE;
> +   advance_right = 0;
> +   }
Do you think it's worth putting a check/warning/something before that, to
verify that either advance_right or advance_left is non-zero, or that we have
reached the end of both trees?


> +   } else if (left_level == right_level) {
...
> +   } else if (left_level < right_level) {
> +   advance_right = ADVANCE;
> +   } else {
> +   advance_left = ADVANCE;
> +   }
Can you please explain why this is correct?
Why if we are on lower level in the "newer" tree than we are in the
"older" tree, we need to advance the "older" tree? I.e., why this
implies that we are on the lower key in the "older" tree? (And
vice-versa)

Re: [RFC PATCH 5/7] Btrfs: add btrfs_compare_trees function

2012-07-04 Thread Alexander Block

On Wed, Jul 4, 2012 at 8:27 PM, Alex Lyakas
 wrote:
> Hi Alex,
>
>> +   spin_lock(&left_root->root_times_lock);
>> +   ctransid = 
>> btrfs_root_ctransid(&left_root->root_item);
>> +   spin_unlock(&left_root->root_times_lock);
>> +   if (ctransid != left_start_ctransid)
>> +   left_start_ctransid = 0;
>> +
>> +   spin_lock(&right_root->root_times_lock);
>> +   ctransid = 
>> btrfs_root_ctransid(&right_root->root_item);
>> +   spin_unlock(&right_root->root_times_lock);
>> +   if (ctransid != right_start_ctransid)
>> +   left_start_ctransid = 0;
> Shouldn't it be here right_start_ctransid=0? Otherwise,
> right_start_ctransid is pretty useless in this function.
>
Hmm you're right, it should be right_start_ctransid. However...the
code was working by accident because the next if does check for left
and right :)
Fixed that in my git repo.
>> +
>> +   if (!left_start_ctransid || !right_start_ctransid) {
>> +   WARN(1, KERN_WARNING
>> +   "btrfs: btrfs_compare_tree detected "
>> +   "a change in one of the trees while "
>> +   "iterating. This is probably a "
>> +   "bug.\n");
>> +   ret = -EIO;
>> +   goto out;
>> +   }
>
> I am reading the code have more questions (and comments), but will
> send them all later.
>
> Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 5/7] Btrfs: add btrfs_compare_trees function

2012-07-04 Thread Alexander Block

On Wed, Jul 4, 2012 at 9:13 PM, Alex Lyakas
 wrote:
> Hi Alex,
>
>> +static int tree_compare_item(struct btrfs_root *left_root,
>> +struct btrfs_path *left_path,
>> +struct btrfs_path *right_path,
>> +char *tmp_buf)
>> +{
>> +   int cmp;
>> +   int len1, len2;
>> +   unsigned long off1, off2;
>> +
>> +   len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
>> +   len2 = btrfs_item_size_nr(right_path->nodes[0], 
>> right_path->slots[0]);
>> +   if (len1 != len2)
>> +   return 1;
>> +
>> +   off1 = btrfs_item_ptr_offset(left_path->nodes[0], 
>> left_path->slots[0]);
>> +   off2 = btrfs_item_ptr_offset(right_path->nodes[0],
>> +   right_path->slots[0]);
>> +
>> +   read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
>> +
>> +   cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, 
>> len1);
>> +   if (cmp)
>> +   return 1;
>> +   return 0;
>> +}
> It might be worth to note in the comment, that tmp_buff should be
> large enough to hold the item from the left tree. Can it happen that
> the right tree has a different leafsize?
>
This function is only to be used for the tree compare function and
there we allocate a buffer of root->leafsize, so definitely all items
should fit. As far as I know, Chris (please correct me if I'm wrong)
once guaranteed that ALL trees in a FS will have the same leaf size
and this will always be the case.
>> +   /*
>> +* Strategy: Go to the first items of both trees. Then do
>> +*
>> +* If both trees are at level 0
>> +*   Compare keys of current items
>> +* If left < right treat left item as new, advance left tree
>> +*   and repeat
>> +* If left > right treat right item as deleted, advance right 
>> tree
>> +*   and repeat
>> +* If left == right do deep compare of items, treat as changed if
>> +*   needed, advance both trees and repeat
>> +* If both trees are at the same level but not at level 0
>> +*   Compare keys of current nodes/leafs
>> +* If left < right advance left tree and repeat
>> +* If left > right advance right tree and repeat
>> +* If left == right compare blockptrs of the next nodes/leafs
>> +*   If they match advance both trees but stay at the same level
>> +* and repeat
>> +*   If they don't match advance both trees while allowing to go
>> +* deeper and repeat
>> +* If tree levels are different
>> +*   Advance the tree that needs it and repeat
>> +*
>> +* Advancing a tree means:
>> +*   If we are at level 0, try to go to the next slot. If that's not
>> +*   possible, go one level up and repeat. Stop when we found a level
>> +*   where we could go to the next slot. We may at this point be on a
>> +*   node or a leaf.
>> +*
>> +*   If we are not at level 0 and not on shared tree blocks, go one
>> +*   level deeper.
>> +*
>> +*   If we are not at level 0 and on shared tree blocks, go one slot 
>> to
>> +*   the right if possible or go up and right.
>> +*/
> According to the strategy and to the code later, "left" tree is
> treated as "newer one", while "right" as "older one", correct? Do you
> think it would be more intuitive to make it the other way around,
> although I guess this is a matter of personal taste. I had to draw the
> leafs reversed to keep going:
> R   L
> - -
> | | | | | | | |
> - -
>
>
To be honest...I always preferred the way you suggested in the past
when I thought about compares. But for some reason, I didn't even
think about that and just implemented that function in single
flow...it took days until I've even noticed that I swapped left/right
in my head :D I now would like to stay with that, as all the btrfs
send code uses left/right in this way and I never had the problem with
mixing that up again. If people like, I have nothing against changing
that later if someone wants to, but that's nothing I would like to do
myself.
>> +   if (advance_left && !left_end_reached) {
>> +   ret = tree_advance(left_root, left_path, &left_level,
>> +   left_root_level,
>> +   advance_left != ADVANCE_ONLY_NEXT,
>> +   &left_key);
>> +   if (ret < 0)
>> +   left_end_reached = ADVANCE;
>> +   advance_left = 0;
>> +   }
>> +   if (advance_right && !right_end_reached) {
>> +   ret = tree_advance(right_root, right_path, 
>> &right_level,
>> +

Re: btrfs GPF in read_extent_buffer() while scrubbing with kernel 3.4.2

2012-07-04 Thread Sami Liedes

On Wed, Jul 04, 2012 at 06:38:00PM +0200, Jan Schmidt wrote:
> > [  200.980496] btrfs: invalid parameters for read_extent_buffer: start 
> > (32771) > eb->len (32768). eb start is 2243489562624, level 26, generation 
> > 3144240307695375391, nritems 620178657. len param 17. debug 
> > 2/989/620178657/3144240307695375391

Let's call this try 1. I ran it three more times, so below we have
tries 2, 3 and 4.

> Wow, that's strange. Can you repeat your test once or twice and paste that 
> line,
> please? I'd like to get a feeling if the values are completely random.

Curiously, it clearly takes longer for it to crash after starting the
scrub each time I run it. Also on try 4 I got an entirely different
crash (backtrace below). Now it scrubs maybe the first 200G or so of
both devices of the (raid-1) 2.2T filesystem before it crashes.

start and eb->len seem to be the same (32771 and 32768) every time. eb
start varies, but there's some pattern if you view them in hex:

  Try 1  20a5a660000
  Try 2  20bb0018000
  Try 3  20a8bc28000
  Try 4  (no output, different crash)

The rest of the values seem to me to be completely different every time.

Sami


Try 2:


[12961.870107] btrfs: invalid parameters for read_extent_buffer: start (32771) 
> eb->len (32768). eb start is 2249220784128, level 14, generation 
2242260605927040034, nritems 117835525. len param 17. debug 
2/989/117835525/2242260605927040034
[12961.870204] [ cut here ]
[12961.870264] WARNING: at fs/btrfs/extent_io.c:4528 
read_extent_buffer+0x167/0x1a0 [btrfs]()
[12961.870302] Hardware name: System Product Name
[12961.870322] Modules linked in:   [last unloaded: scsi_wait_scan]
[12961.870367] Pid: 1144, comm: btrfs-endio-met Tainted: GW
3.4.4-modded-oops+ #2
[12961.870403] Call Trace:
[12961.870421]  [] warn_slowpath_common+0x7a/0xb0
[12961.870449]  [] warn_slowpath_null+0x15/0x20
[...]


Try 3:


[  531.770984] btrfs: invalid parameters for read_extent_buffer: start (32771) 
> eb->len (32768). eb start is 2244317708288, level 170, generation 
13639284858109917187, nritems 6171943. len param 17. debug 
2/989/6171943/13639284858109917187
[  531.771081] [ cut here ]
[  531.771133] WARNING: at fs/btrfs/extent_io.c:4528 
read_extent_buffer+0x167/0x1a0 [btrfs]()
[  531.771169] Hardware name: System Product Name
[  531.771191] Modules linked in:   [last unloaded: scsi_wait_scan]
[  531.771229] Pid: 1132, comm: btrfs-endio-met Tainted: GW
3.4.4-modded-oops+ #2
[  531.771265] Call Trace:
[  531.771282]  [] warn_slowpath_common+0x7a/0xb0
[...]


Try 4:


[   95.933108] netconsole: network logging started
[  982.651987] unable to find logical 691402650139365534 len 32768
[  982.652060] [ cut here ]
[  982.652085] kernel BUG at fs/btrfs/volumes.c:3725!
[  982.652109] invalid opcode:  [#1] SMP 
[  982.652138] CPU 4 
[  982.652149] Modules linked in:   [last unloaded: scsi_wait_scan]
[  982.652190] 
[  982.652201] Pid: 1127, comm: btrfs-endio-met Tainted: GW
3.4.4-modded-oops+ #2 System manufacturer System Product Name/P8P67 EVO
[  982.652264] RIP: 0010:[]  [] 
__btrfs_map_block+0x668/0x680 [btrfs]
[  982.652323] RSP: :8801f43cfa70  EFLAGS: 00010286
[  982.652347] RAX: 0049 RBX: 09985b0c0e52109e RCX: 0082
[  982.652379] RDX: 00e8 RSI: 0046 RDI: 0246
[  982.652411] RBP: 8801f43cfb10 R08:  R09: 
[  982.652443] R10: 8801f1fbf680 R11: 8801f3b00780 R12: 8801f0026108
[  982.652475] R13: 8000 R14: 8801f0026fe0 R15: 8801f43cfbb0
[  982.652507] FS:  () GS:88021ed0() 
knlGS:
[  982.652542] CS:  0010 DS:  ES:  CR0: 8005003b
[  982.652568] CR2: f775c000 CR3: 0002110a1000 CR4: 000407e0
[  982.652600] DR0:  DR1:  DR2: 
[  982.652632] DR3:  DR6: 0ff0 DR7: 0400
[  982.652664] Process btrfs-endio-met (pid: 1127, threadinfo 8801f43ce000, 
task 8801f401c2f0)
[  982.652703] Stack:
[  982.652715]  8801f43cfa90 811dc553 88021ec0ce88 
88021ed0ce80
[  982.652761]  8801f43cfb00 0086 8801f43cfaf0 
0086
[  982.652807]  00fdf43cfad0 8801f43cfba8 0024 
0004
[  982.652852] Call Trace:
[  982.652869]  [] ? cpumask_next_and+0x23/0x40
[  982.652897]  [] ? kmem_cache_alloc_trace+0xc2/0x100
[  982.652940]  [] btrfs_map_block+0x9/0x10 [btrfs]
[  982.652979]  [] reada_add_block+0x1c2/0x890 [btrfs]
[  9

Re: [RFC PATCH 5/7] Btrfs: add btrfs_compare_trees function

2012-07-04 Thread David Sterba

On Wed, Jul 04, 2012 at 10:18:34PM +0200, Alexander Block wrote:
> > It might be worth to note in the comment, that tmp_buff should be
> > large enough to hold the item from the left tree. Can it happen that
> > the right tree has a different leafsize?
> >
> This function is only to be used for the tree compare function and
> there we allocate a buffer of root->leafsize, so definitely all items
> should fit. As far as I know, Chris (please correct me if I'm wrong)
> once guaranteed that ALL trees in a FS will have the same leaf size
> and this will always be the case.

Not only leaves are of the same size in all trees, but also nodes, since
the metadata bigblocks patches.

david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: BTRFS fsck apparent errors

2012-07-04 Thread David Sterba

On Wed, Jul 04, 2012 at 10:46:21PM +0700, Fajar A. Nugraha wrote:
> > Is it just mount/umount without any other activity?
> Yes
> 
> > Is the fs
> > fragmented
> Not sure how to check that quickly
> 
> > (or aged),
> Over 1 year, so yes
> 
> > almost full,
> df says 83% used, so probably yes (depending on how you define "almost")

That matches my expectation; this could lead to the mount/umount
slowness due to fragmentation.

> > has lots of files?
> 
> it's a "normal" 1 TB usb disk, with docs, movies, vm images, etc. No
> particular lots-of-small-files like maildir or anything like that.

So it's probably not an issue with inode_cache.

> >> # time umount /media/WD-root/
> >>
> >> real  0m22.419s
> >> user  0m0.000s
> >> sys   0m0.064s
> >>
> >> # /proc/10142/stack  <--- the PID of umount process
> >
> > The process(es) actually doing the work are the btrfs workers, usual
> > suspects are btrfs-cache (free space cache) or btrfs-ino (inode cache)
> > that are writing the cache states back to disk.
> 
> Not sure about that, since iostat shows it's mostly read, not write.
> Will try iotop later.
> I tested also with Chris' for-linus on top of 3.4, same result (really
> long time to umount).

Would be good to verify if it's the btrfs-cache worker or not, IIRC
there were more writes than reads, so I'm not sure this is the right
direction.

The 3.5 series or 3.4+for-linus has some changes wrt free space cache
(removed the 'ideal caching mode') that caused slow mounts but has been
fixed.

I've looked again at the umount process call stack, and it's waiting
for writing the btree_inode which is the representation of the b-tree
nodes, it's quite possible that changes to the generic writeback code are
causing this. AFAIK the btree_inode does not behave as a normal file
inode regarding writeback.  The good reference point is 3.2, there were
non-trivial writeback changes merged since.

Guessing now, if the mount causes eg. atime update, then this triggers
cow, dirties the btree_inode and needs to read data from disk,
fragmentation slows this down. Number of cowed blocks is small compared
to the reads (and maybe generic readahead reads more than what's
actually needed for the cow operation ...).

david
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Btrfs: Add code to support file creation time.

2012-07-04 Thread Li Zefan

On 2012/7/4 19:04, Alexander Block wrote:

> On Wed, Jul 4, 2012 at 9:56 AM, Li Zefan  wrote:
>> On 2012/7/4 15:18, chandan r wrote:
>>
>>> This patch adds a new member to the 'struct btrfs_inode' structure to hold
>>> the file creation time.
>>>
>>
>>
>> Well, how do users use this file creation time? There's no syscall and
>> there's no ioctl that exports this information. That xstat syscall hasn't
>> been accepted, so you can revise and repost the patch once that happens.
> In my opinion we should still include this patch. Currently, otime is never
> even initialized, having undefined values. If it ever becomes possible to
> access otime, we would at least have some inodes with valid otime fields.


otime (on disk) is initialized to 0, not some undefined value. But yeah, your
point makes some sense, that with this patch we can access valid otime in an
old filesystem once we update to a new kernel which has otime support.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Btrfs: Add code to support file creation time.

2012-07-04 Thread Alexander Block

On Thu, Jul 5, 2012 at 3:07 AM, Li Zefan  wrote:
> On 2012/7/4 19:04, Alexander Block wrote:
>
>> On Wed, Jul 4, 2012 at 9:56 AM, Li Zefan  wrote:
>>> On 2012/7/4 15:18, chandan r wrote:
>>>
>>>> This patch adds a new member to the 'struct btrfs_inode' structure to hold
>>>> the file creation time.

>>>
>>>
>>> Well, how do users use this file creation time? There's no syscall and 
>>> there's
>>> no ioctl that exports this information. That xstat syscall hasn't been 
>>> accepted,
>>> so you can revise and repost the patch when you see it happens.
>> In my opinion we should still include this patch. Currently, otime is never 
>> even
>> initialized, having undefined values. If it ever gets possible to
>> access otime, we
>> would at least have some inodes with valid otime fields.
>
>
> otime (on disk) is initialized to 0, not some undefined value. But yeah, your 
> point makes
> some sense, that with this patch we can access valid otime in an old 
> filesystem once we
> update to a new kernel which has otime support.
This is true for the inode items found in the root tree. But the inode
items found in the filesystem trees are not initialized at all. I did
a fast check by adding printing of the otime field in
btrfs-debug-tree...and every inode's otime looks random.
btrfs_new_inode uses btrfs_insert_empty_items which does not zero the
new item, then fill_inode_item is used to initialize the fields and
there otime is missing.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

43 matches

Mail list logo