Re: [PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-24 Thread Stefan Behrens
On Fri, 21 Jun 2013 09:11:38 -0700, Zach Brown wrote:
 +  offset = (unsigned long)ptr;
 +  while (sub_item_len > 0) {
 +  u64 data;
 +
 +  read_extent_buffer(eb, &data, offset, sizeof(data));
 +  data = le64_to_cpu(data);
 +  if (data == subid) {
 +  ret = 0;
 +  break;
 +  }
 +  offset += sizeof(data);
 +  sub_item_len--;
 +  }

 This could be cleaned up a bit by comparing an on-stack little-endian
 input subid with each little-endian subid in the item with
 memcmp_extent_buffer().

 This would save some CPU cycles for the repeated le64_to_cpu() and for
 the memcpy(). The number of lines of code is equal for both ways.
 
 Hmm?  It would be many fewer lines of code.

Are you thinking of something shorter than the following?

offset = (unsigned long)ptr;
subid = cpu_to_le64(subid);
while (sub_item_len > 0) {
if (memcmp_extent_buffer(eb, &subid, offset,
 sizeof(subid)) == 0) {
ret = 0;
break;
}
offset += sizeof(subid);
sub_item_len--;
}



Re: [PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-24 Thread Zach Brown
  This would save some CPU cycles for the repeated le64_to_cpu() and for
  the memcpy(). The number of lines of code is equal for both ways.
  
  Hmm?  It would be many fewer lines of code.
 
 Are you thinking of something shorter than the following?

That's better, yeah, but there's still some room left to go.  Almost all
of the remaining code is boiler-plate iteration that could be done with
a for(;;).

This could be a mini OCC contest! :)

 offset = (unsigned long)ptr;
 subid = cpu_to_le64(subid);

 while (sub_item_len > 0) {
   if (memcmp_extent_buffer(eb, &subid, offset,
sizeof(subid)) == 0) {
   ret = 0;
   break;
   }
   offset += sizeof(subid);
   sub_item_len--;
 }

roughly:

unsigned long end = offset + sub_item_len * sizeof(subid);

for (ret = 1; offset < end && ret; offset += sizeof(subid))
ret = !!memcmp_extent_buffer(eb, &subid, offset,
 sizeof(subid));

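(For completeness, a whole helper built around that loop might look roughly
like the sketch below; the function name and arguments are invented for
illustration, they're not from the patch.)

/*
 * Illustrative sketch only: return 0 if 'subid' occurs among the 'count'
 * little-endian u64 values starting at 'offset' in the leaf, 1 otherwise.
 * Assumes 'offset' already points at the subid array of the uuid item.
 */
static int uuid_item_has_subid(struct extent_buffer *eb, unsigned long offset,
                               u32 count, u64 subid)
{
        __le64 subid_le = cpu_to_le64(subid);
        unsigned long end = offset + count * sizeof(subid_le);
        int ret;

        for (ret = 1; offset < end && ret; offset += sizeof(subid_le))
                ret = !!memcmp_extent_buffer(eb, &subid_le, offset,
                                             sizeof(subid_le));
        return ret;
}
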
- z


Re: [PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-21 Thread Stefan Behrens
On Thu, 20 Jun 2013 12:47:04 -0700, Zach Brown wrote:
 
 +/* for items that use the BTRFS_UUID_KEY */
 +#define BTRFS_UUID_ITEM_TYPE_SUBVOL 0 /* for UUIDs assigned to subvols */
 +#define BTRFS_UUID_ITEM_TYPE_RECEIVED_SUBVOL 1 /* for UUIDs assigned to
 +   * received subvols */
 +
 +/* a sequence of such items is stored under the BTRFS_UUID_KEY */
 +struct btrfs_uuid_item {
 +__le16 type;/* refer to BTRFS_UUID_ITEM_TYPE* defines above */
 +__le32 len; /* number of following 64bit values */
 +__le64 subid[0];/* sequence of subids */
 +} __attribute__ ((__packed__));
 +
 
 [...]
 
  /*
 + * Stores items that allow to quickly map UUIDs to something else.
 + * These items are part of the filesystem UUID tree.
 + * The key is built like this:
 + * (UUID_upper_64_bits, BTRFS_UUID_KEY, UUID_lower_64_bits).
 + */
 +#if BTRFS_UUID_SIZE != 16
 +#error UUID items require BTRFS_UUID_SIZE == 16!
 +#endif
 +#define BTRFS_UUID_KEY  251
 
 Why do we need this btrfs_uuid_item structure?  Why not set the key type
 to either _SUBVOL or _RECEIVED_SUBVOL instead of embedding structs with
 those types under items with the constant BTRFS_UUID_KEY.  Then use the
 item size to determine the number of u64 subids.  Then the item has a
 simple array of u64s in the data which will be a lot easier to work
 with.

Maintaining a 16-bit uuid_type field inside the item is done in order to
avoid wasting type values for the key. The type field in the key is only
8 bits wide, and 34 type values are already defined in ctree.h.

As long as such items are kept in a separate tree, their key type value
could be reused in the future when the 8-bit type field is exhausted.

There was some discussion in #btrfs about this topic on 2013-04-26.
There are pros and cons. The uuid_type field in the item allows the uuid
items to be used generically outside of the uuid tree, since they occupy
only one value in the 8-bit key type field. On the other hand, if the
8-bit type field in the key is ever exhausted, it saves lines of code and
avoids complexity to implement a single common way of expanding that
field, instead of implementing multiplexing layers in multiple places,
although maybe with some added performance penalty.
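
To make the difference concrete, determining the number of subids would
look roughly like this in the two variants (just a sketch, not code from
the patch; eb/slot are the leaf and slot of the located item, and the
per-mapping key types in the second variant do not exist anywhere yet):

/* Variant 1 (this patch): single key type BTRFS_UUID_KEY, the item starts
 * with a btrfs_uuid_item header whose 'len' gives the number of subids. */
struct btrfs_uuid_item *hdr = btrfs_item_ptr(eb, slot, struct btrfs_uuid_item);
u32 count_v1 = btrfs_uuid_len(eb, hdr);

/* Variant 2 (Zach's proposal): one key type per mapping type, the item data
 * is a bare array of __le64 subids, so the count falls out of the item size. */
u32 count_v2 = btrfs_item_size_nr(eb, slot) / sizeof(u64);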

Anybody else with an opinion for this topic?


 +/* btrfs_uuid_item */
 +BTRFS_SETGET_FUNCS(uuid_type, struct btrfs_uuid_item, type, 16);
 +BTRFS_SETGET_FUNCS(uuid_len, struct btrfs_uuid_item, len, 32);
 +BTRFS_SETGET_STACK_FUNCS(stack_uuid_type, struct btrfs_uuid_item, type, 16);
 +BTRFS_SETGET_STACK_FUNCS(stack_uuid_len, struct btrfs_uuid_item, len, 32);
 
 This would all go away.
 
 +/*
 + * One key is used to store a sequence of btrfs_uuid_item items.
 + * Each item in the sequence contains a type information and a sequence of
 + * ids (together with the information about the size of the sequence of ids).
 + * {[btrfs_uuid_item type0 {id0, id1, ..., idN}],
 + *  ...,
 + *  [btrfs_uuid_item typeZ {id0, id1, ..., idN}]}
 + *
 + * It is forbidden to put multiple items with the same type under the same key.
 + * Instead the sequence of ids is extended and used to store any additional
 + * ids for the same item type.
 
 This constraint, and the cost of ensuring it and repairing violations,
 would go away.
 
 +static struct btrfs_uuid_item *btrfs_match_uuid_item_type(
 +struct btrfs_path *path, u16 type)
 +{
 +struct extent_buffer *eb;
 +int slot;
 +struct btrfs_uuid_item *ptr;
 +u32 item_size;
 +
 +eb = path->nodes[0];
 +slot = path->slots[0];
 +ptr = btrfs_item_ptr(eb, slot, struct btrfs_uuid_item);
 +item_size = btrfs_item_size_nr(eb, slot);
 +do {
 +u16 sub_item_type;
 +u64 sub_item_len;
 +
 +if (item_size < sizeof(*ptr)) {
 +pr_warn("btrfs: uuid item too short (%lu < %d)!\n",
 +(unsigned long)item_size, (int)sizeof(*ptr));
 +return NULL;
 +}
 +item_size -= sizeof(*ptr);
 +sub_item_type = btrfs_uuid_type(eb, ptr);
 +sub_item_len = btrfs_uuid_len(eb, ptr);
 +if (sub_item_len * sizeof(u64) > item_size) {
 +pr_warn("btrfs: uuid item too short (%llu > %lu)!\n",
 +(unsigned long long)(sub_item_len *
 + sizeof(u64)),
 +(unsigned long)item_size);
 +return NULL;
 +}
 +if (sub_item_type == type)
 +return ptr;
 +item_size -= sub_item_len * sizeof(u64);
 +ptr = 1 + (struct btrfs_uuid_item *)
 +(((char *)ptr) + (sub_item_len * sizeof(u64)));
 +} while (item_size);

 +static int btrfs_uuid_tree_lookup_prepare(struct btrfs_root *uuid_root,
 +  u8 *uuid, u16 type,
 +  struct btrfs_path *path,
 

Re: [PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-21 Thread Zach Brown
  Why do we need this btrfs_uuid_item structure?  Why not set the key type
  to either _SUBVOL or _RECEIVED_SUBVOL instead of embedding structs with
  those types under items with the constant BTRFS_UUID_KEY.  Then use the
  item size to determine the number of u64 subids.  Then the item has a
  simple array of u64s in the data which will be a lot easier to work
  with.
 
 Maintaining a 16-bit uuid_type field inside the item is done in order to
 avoid wasting type values for the key. The type field in the key is only
 8 bits wide, and 34 type values are already defined in ctree.h.
 
 As long as such items are kept in a separate tree, their key type value
 could be reused in the future when the 8-bit type field is exhausted.
 
 There was some discussion in #btrfs about this topic on 2013-04-26.
 There are pros and cons. The uuid_type field in the item allows the uuid
 items to be used generically outside of the uuid tree, since they occupy
 only one value in the 8-bit key type field. On the other hand, if the
 8-bit type field in the key is ever exhausted, it saves lines of code and
 avoids complexity to implement a single common way of expanding that
 field, instead of implementing multiplexing layers in multiple places,
 although maybe with some added performance penalty.
 
 Anybody else with an opinion for this topic?

Perhaps obviously, I'm strongly in favour of not rolling another layer
of indirection inside the items, especially when the code to implement
it is full of fragile pointer math.

  +  offset = (unsigned long)ptr;
  +  while (sub_item_len > 0) {
  +  u64 data;
  +
  +  read_extent_buffer(eb, &data, offset, sizeof(data));
  +  data = le64_to_cpu(data);
  +  if (data == subid) {
  +  ret = 0;
  +  break;
  +  }
  +  offset += sizeof(data);
  +  sub_item_len--;
  +  }
  
  This could be cleaned up a bit by comparing an on-stack little-endian
  input subid with each little-endian subid in the item with
  memcmp_extent_buffer().
 
 This would save some CPU cycles for the repeated le64_to_cpu() and for
 the memcpy(). The number of lines of code is equal for both ways.

Hmm?  It would be many fewer lines of code.

 The read_extent_buffer() way is more straightforward and readable IMO.

  How does this extension avoid the BUG when the leaf containing the item
  doesn't have room for the new subid?
 
 btrfs_extend_item() does not BUG() when you have called
 btrfs_search_slot() before with ins_len > 0.

Ah, got it.  Thanks.
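
(If I follow, the pattern is roughly this; a sketch from memory with error
handling left out, not the code from the patch:)

        /* Reserving room at search time keeps btrfs_extend_item() off its
         * BUG_ON() path when one more subid is appended later. */
        ret = btrfs_search_slot(trans, uuid_root, &key, path,
                                sizeof(u64) /* ins_len */, 1 /* cow */);
        if (ret == 0) {
                /* key already exists: grow the item by one subid */
                btrfs_extend_item(uuid_root, path, sizeof(u64));
                /* ... then write the new subid at the end of the item */
        }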

- z


Re: [PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-21 Thread Chris Mason
Quoting Stefan Behrens (2013-06-21 04:47:20)
 On Thu, 20 Jun 2013 12:47:04 -0700, Zach Brown wrote:
  
  +/* for items that use the BTRFS_UUID_KEY */
  +#define BTRFS_UUID_ITEM_TYPE_SUBVOL 0 /* for UUIDs assigned to subvols */
  +#define BTRFS_UUID_ITEM_TYPE_RECEIVED_SUBVOL 1 /* for UUIDs assigned to
  +   * received subvols */
  +
  +/* a sequence of such items is stored under the BTRFS_UUID_KEY */
  +struct btrfs_uuid_item {
  +__le16 type;/* refer to BTRFS_UUID_ITEM_TYPE* defines above */
  +__le32 len; /* number of following 64bit values */
  +__le64 subid[0];/* sequence of subids */
  +} __attribute__ ((__packed__));
  +
  
  [...]
  
   /*
  + * Stores items that allow to quickly map UUIDs to something else.
  + * These items are part of the filesystem UUID tree.
  + * The key is built like this:
  + * (UUID_upper_64_bits, BTRFS_UUID_KEY, UUID_lower_64_bits).
  + */
  +#if BTRFS_UUID_SIZE != 16
  +#error UUID items require BTRFS_UUID_SIZE == 16!
  +#endif
  +#define BTRFS_UUID_KEY  251
  
  Why do we need this btrfs_uuid_item structure?  Why not set the key type
  to either _SUBVOL or _RECEIVED_SUBVOL instead of embedding structs with
  those types under items with the constant BTRFS_UUID_KEY.  Then use the
  item size to determine the number of u64 subids.  Then the item has a
  simple array of u64s in the data which will be a lot easier to work
  with.
 
 Maintaining a 16-bit uuid_type field inside the item is done in order to
 avoid wasting type values for the key. The type field in the key is only
 8 bits wide, and 34 type values are already defined in ctree.h.
 
 As long as such items are kept in a separate tree, their key type value
 could be reused in the future when the 8-bit type field is exhausted.
 
 There was some discussion in #btrfs about this topic on 2013-04-26.
 There are pros and cons. The uuid_type field in the item allows the uuid
 items to be used generically outside of the uuid tree, since they occupy
 only one value in the 8-bit key type field. On the other hand, if the
 8-bit type field in the key is ever exhausted, it saves lines of code and
 avoids complexity to implement a single common way of expanding that
 field, instead of implementing multiplexing layers in multiple places,
 although maybe with some added performance penalty.
 
 Anybody else with an opinion for this topic?

I've been thinking about this, and while I clearly did not have this use
of uuids in mind originally, they have become a key part of the FS.  They
deserve their own bits in the type field.

I know this is not an easy thing to redo, but long term it really looks
worth it to me.

-chris


[PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-20 Thread Stefan Behrens
Mapping UUIDs to subvolume IDs is an expensive operation today. The
current algorithm even has quadratic effort (based on the number of
existing subvolumes), which means that it takes minutes to send/receive
a single subvolume if 10,000 subvolumes exist. But even linear effort
would be too much since it is simply wasted work. And these data
structures that allow mapping UUIDs to subvolume IDs are created every
time a btrfs send/receive instance is started.

It is much more efficient to maintain a searchable persistent data
structure in the filesystem, one that is updated whenever a
subvolume/snapshot is created and deleted, and when the received
subvolume UUID is set by the btrfs-receive tool.

Therefore this commit adds kernel code that maintains data structures
in the filesystem which allow quickly searching for a given UUID and
retrieving the data that is assigned to this UUID, like which subvolume
ID is related to this UUID.

This commit adds a new tree to hold UUID-to-data mapping items. The
key of the items is the full UUID plus the key type BTRFS_UUID_KEY.
Multiple data blocks can be stored for a given UUID; a type/length/value
scheme is used.
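
For illustration (this sketch is not part of the patch code), the key for
a given 16-byte UUID is built by splitting it into two 64-bit halves,
roughly like this; the real conversion is done by a helper in uuid-tree.c,
and how exactly the halves are read is an assumption here:

static void uuid_to_key_sketch(const u8 *uuid, struct btrfs_key *key)
{
        key->objectid = get_unaligned_le64(uuid);               /* first 8 bytes */
        key->type = BTRFS_UUID_KEY;
        key->offset = get_unaligned_le64(uuid + sizeof(u64));   /* last 8 bytes */
}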

Now follows the lengthy justification of why a new tree was added
instead of using the existing root tree:

The first approach was to not create another tree that holds UUID
items. Instead, the items should just go into the top root tree.
Unfortunately this confused the algorithm to assign the objectid
of subvolumes and snapshots. The reason is that
btrfs_find_free_objectid() calls btrfs_find_highest_objectid() for
the first created subvol or snapshot after mounting a filesystem,
and this function simply searches for the largest used objectid in
the root tree keys to pick the next objectid to assign. Of course,
the UUID keys have always been the ones with the highest offset
value, and the next assigned subvol ID was wastefully huge.

Using any other existing tree did not look proper. Applying a
workaround such as setting the objectid to zero in the UUID item
key and implementing collision handling would either add
limitations (in case of a btrfs_extend_item() approach to handle
the collisions) or a lot of complexity and source code (in case a
collision-free key would have to be looked up). Adding new code
that introduces limitations is not good, and adding code that is
complex and lengthy for no good reason is also not good. That's the
justification why a completely new tree was introduced.

Signed-off-by: Stefan Behrens <sbehr...@giantdisaster.de>
---
 fs/btrfs/Makefile|   3 +-
 fs/btrfs/ctree.h |  50 ++
 fs/btrfs/uuid-tree.c | 480 +++
 3 files changed, 532 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3932224..a550dfc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-  reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
+  reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+  uuid-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 76e4983..04447b6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
 /* holds quota configuration and tracking */
 #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
 
+/* for storing items that use the BTRFS_UUID_KEY */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
@@ -953,6 +956,18 @@ struct btrfs_dev_replace_item {
__le64 num_uncorrectable_read_errors;
 } __attribute__ ((__packed__));
 
+/* for items that use the BTRFS_UUID_KEY */
+#define BTRFS_UUID_ITEM_TYPE_SUBVOL    0 /* for UUIDs assigned to subvols */
+#define BTRFS_UUID_ITEM_TYPE_RECEIVED_SUBVOL   1 /* for UUIDs assigned to
+  * received subvols */
+
+/* a sequence of such items is stored under the BTRFS_UUID_KEY */
+struct btrfs_uuid_item {
+   __le16 type;/* refer to BTRFS_UUID_ITEM_TYPE* defines above */
+   __le32 len; /* number of following 64bit values */
+   __le64 subid[0];/* sequence of subids */
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM   (1ULL << 1)
@@ -1922,6 +1937,17 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_REPLACE_KEY  250
 
 /*
+ * Stores items that allow to quickly map UUIDs to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY, UUID_lower_64_bits).
+ */

Re: [PATCH v5 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-06-20 Thread Zach Brown

 +/* for items that use the BTRFS_UUID_KEY */
 +#define BTRFS_UUID_ITEM_TYPE_SUBVOL  0 /* for UUIDs assigned to subvols */
 +#define BTRFS_UUID_ITEM_TYPE_RECEIVED_SUBVOL 1 /* for UUIDs assigned to
 +* received subvols */
 +
 +/* a sequence of such items is stored under the BTRFS_UUID_KEY */
 +struct btrfs_uuid_item {
 + __le16 type;/* refer to BTRFS_UUID_ITEM_TYPE* defines above */
 + __le32 len; /* number of following 64bit values */
 + __le64 subid[0];/* sequence of subids */
 +} __attribute__ ((__packed__));
 +

[...]

  /*
 + * Stores items that allow to quickly map UUIDs to something else.
 + * These items are part of the filesystem UUID tree.
 + * The key is built like this:
 + * (UUID_upper_64_bits, BTRFS_UUID_KEY, UUID_lower_64_bits).
 + */
 +#if BTRFS_UUID_SIZE != 16
 +#error UUID items require BTRFS_UUID_SIZE == 16!
 +#endif
 +#define BTRFS_UUID_KEY   251

Why do we need this btrfs_uuid_item structure?  Why not set the key type
to either _SUBVOL or _RECEIVED_SUBVOL instead of embedding structs with
those types under items with the constant BTRFS_UUID_KEY.  Then use the
item size to determine the number of u64 subids.  Then the item has a
simple array of u64s in the data which will be a lot easier to work
with.
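
A lookup in that scheme would boil down to roughly the following (a
sketch; BTRFS_UUID_KEY_SUBVOL is a made-up name for a per-mapping key
type, and the key split just mirrors the one described in the patch
comment):

        key.objectid = get_unaligned_le64(uuid);                /* first 8 bytes */
        key.type = BTRFS_UUID_KEY_SUBVOL;       /* hypothetical per-mapping type */
        key.offset = get_unaligned_le64(uuid + sizeof(u64));    /* last 8 bytes */

        ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
        if (ret > 0)
                ret = -ENOENT;
        if (!ret) {
                struct extent_buffer *eb = path->nodes[0];
                int slot = path->slots[0];
                unsigned long offset = btrfs_item_ptr_offset(eb, slot);
                unsigned long end = offset + btrfs_item_size_nr(eb, slot);
                __le64 subid_le = cpu_to_le64(subid);

                ret = -ENOENT;
                for (; offset < end; offset += sizeof(subid_le)) {
                        if (!memcmp_extent_buffer(eb, &subid_le, offset,
                                                  sizeof(subid_le))) {
                                ret = 0;
                                break;
                        }
                }
        }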

 +/* btrfs_uuid_item */
 +BTRFS_SETGET_FUNCS(uuid_type, struct btrfs_uuid_item, type, 16);
 +BTRFS_SETGET_FUNCS(uuid_len, struct btrfs_uuid_item, len, 32);
 +BTRFS_SETGET_STACK_FUNCS(stack_uuid_type, struct btrfs_uuid_item, type, 16);
 +BTRFS_SETGET_STACK_FUNCS(stack_uuid_len, struct btrfs_uuid_item, len, 32);

This would all go away.

 +/*
 + * One key is used to store a sequence of btrfs_uuid_item items.
 + * Each item in the sequence contains a type information and a sequence of
 + * ids (together with the information about the size of the sequence of ids).
 + * {[btrfs_uuid_item type0 {id0, id1, ..., idN}],
 + *  ...,
 + *  [btrfs_uuid_item typeZ {id0, id1, ..., idN}]}
 + *
 + * It is forbidden to put multiple items with the same type under the same key.
 + * Instead the sequence of ids is extended and used to store any additional
 + * ids for the same item type.

This constraint, and the cost of ensuring it and repairing violations,
would go away.

 +static struct btrfs_uuid_item *btrfs_match_uuid_item_type(
 + struct btrfs_path *path, u16 type)
 +{
 + struct extent_buffer *eb;
 + int slot;
 + struct btrfs_uuid_item *ptr;
 + u32 item_size;
 +
 + eb = path->nodes[0];
 + slot = path->slots[0];
 + ptr = btrfs_item_ptr(eb, slot, struct btrfs_uuid_item);
 + item_size = btrfs_item_size_nr(eb, slot);
 + do {
 + u16 sub_item_type;
 + u64 sub_item_len;
 +
 + if (item_size < sizeof(*ptr)) {
 + pr_warn("btrfs: uuid item too short (%lu < %d)!\n",
 + (unsigned long)item_size, (int)sizeof(*ptr));
 + return NULL;
 + }
 + item_size -= sizeof(*ptr);
 + sub_item_type = btrfs_uuid_type(eb, ptr);
 + sub_item_len = btrfs_uuid_len(eb, ptr);
 + if (sub_item_len * sizeof(u64) > item_size) {
 + pr_warn("btrfs: uuid item too short (%llu > %lu)!\n",
 + (unsigned long long)(sub_item_len *
 +  sizeof(u64)),
 + (unsigned long)item_size);
 + return NULL;
 + }
 + if (sub_item_type == type)
 + return ptr;
 + item_size -= sub_item_len * sizeof(u64);
 + ptr = 1 + (struct btrfs_uuid_item *)
 + (((char *)ptr) + (sub_item_len * sizeof(u64)));
 + } while (item_size);

 +static int btrfs_uuid_tree_lookup_prepare(struct btrfs_root *uuid_root,
 +   u8 *uuid, u16 type,
 +   struct btrfs_path *path,
 +   struct btrfs_uuid_item **ptr)
 +{
 + int ret;
 + struct btrfs_key key;
 +
 + if (!uuid_root) {
 + WARN_ON_ONCE(1);
 + ret = -ENOENT;
 + goto out;
 + }
 +
 + btrfs_uuid_to_key(uuid, &key);
 +
 + ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
 + if (ret < 0)
 + goto out;
 + if (ret > 0) {
 + ret = -ENOENT;
 + goto out;
 + }
 +
 + *ptr = btrfs_match_uuid_item_type(path, type);
 + if (!*ptr) {
 + ret = -ENOENT;
 + goto out;
 + }
 +
 + ret = 0;
 +
 +out:
 + return ret;
 +}

All of this is replaced with the simple search_slot in the caller.
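
(That is, all that would remain in the caller is something like the
following; again just a sketch:)

        ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;       /* search error */
        if (ret > 0)
                ret = -ENOENT;  /* key, and thus the mapping, does not exist */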

 + offset = (unsigned long)ptr;
 + while (sub_item_len > 0) {
 + u64 data;
 +
 + read_extent_buffer(eb, &data, offset, sizeof(data));
 + data = le64_to_cpu(data);
 +