https://gitlab.com/vadorovsky/btrfs-perf
https://github.com/mrostecki/btrfs-perf
Signed-off-by: Michal Rostecki <mroste...@suse.com>
---
fs/btrfs/ctree.h | 3 +
fs/btrfs/disk-io.c | 3 +
fs/btrfs/sysfs.c | 93 +++++++++++++++++++++++++++++-
fs/btrfs/volumes.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-
fs/btrfs/volumes.h | 10 ++++
5 files changed, 242 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a9b0521d9e89..6ff0a18fd219 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -976,6 +976,9 @@ struct btrfs_fs_info {
/* Max size to emit ZONE_APPEND write command */
u64 max_zone_append_size;
+ /* Last mirror picked in round-robin selection */
+ int __percpu *last_mirror;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 71fab77873a5..937fcadbdd2f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1547,6 +1547,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ free_percpu(fs_info->last_mirror);
kvfree(fs_info);
}
@@ -2857,6 +2858,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
+
+ fs_info->last_mirror = __alloc_percpu(sizeof(int), __alignof__(int));
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a8f528eb4e50..b9a6d38843ef 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -928,7 +928,7 @@ static bool strmatch(const char *buffer, const char *string)
return false;
}
-static const char * const btrfs_read_policy_name[] = { "pid" };
+static const char * const btrfs_read_policy_name[] = { "pid", "roundrobin" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -976,8 +976,99 @@ static ssize_t btrfs_read_policy_store(struct kobject
*kobj,
}
BTRFS_ATTR_RW(read_policies, policy, btrfs_read_policy_show,
btrfs_read_policy_store);
+static ssize_t btrfs_roundrobin_nonlocal_inc_mixed_only_show(
+ struct kobject *kobj, struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj->parent);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+
READ_ONCE(fs_devices->roundrobin_nonlocal_inc_mixed_only));
+}
+
+static ssize_t btrfs_roundrobin_nonlocal_inc_mixed_only_store(
+ struct kobject *kobj, struct kobj_attribute *a, const char *buf,
+ size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj->parent);
+ bool val;
+ int ret;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_devices->roundrobin_nonlocal_inc_mixed_only, val);
+ return len;
+}
+BTRFS_ATTR_RW(read_policies, roundrobin_nonlocal_inc_mixed_only,
+ btrfs_roundrobin_nonlocal_inc_mixed_only_show,
+ btrfs_roundrobin_nonlocal_inc_mixed_only_store);
+
+static ssize_t btrfs_roundrobin_nonrot_nonlocal_inc_show(struct kobject *kobj,
+ struct kobj_attribute
*a,
+ char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj->parent);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ READ_ONCE(fs_devices->roundrobin_nonrot_nonlocal_inc));
+}
+
+static ssize_t btrfs_roundrobin_nonrot_nonlocal_inc_store(struct kobject *kobj,
+ struct kobj_attribute
*a,
+ const char *buf,
+ size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj->parent);
+ u32 val;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &val);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_devices->roundrobin_nonrot_nonlocal_inc, val);
+ return len;
+}
+BTRFS_ATTR_RW(read_policies, roundrobin_nonrot_nonlocal_inc,
+ btrfs_roundrobin_nonrot_nonlocal_inc_show,
+ btrfs_roundrobin_nonrot_nonlocal_inc_store);
+
+static ssize_t btrfs_roundrobin_rot_nonlocal_inc_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj->parent);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ READ_ONCE(fs_devices->roundrobin_rot_nonlocal_inc));
+}
+
+static ssize_t btrfs_roundrobin_rot_nonlocal_inc_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf,
+ size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj->parent);
+ u32 val;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &val);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_devices->roundrobin_rot_nonlocal_inc, val);
+ return len;
+}
+BTRFS_ATTR_RW(read_policies, roundrobin_rot_nonlocal_inc,
+ btrfs_roundrobin_rot_nonlocal_inc_show,
+ btrfs_roundrobin_rot_nonlocal_inc_store);
+
static const struct attribute *read_policies_attrs[] = {
BTRFS_ATTR_PTR(read_policies, policy),
+ BTRFS_ATTR_PTR(read_policies, roundrobin_nonlocal_inc_mixed_only),
+ BTRFS_ATTR_PTR(read_policies, roundrobin_nonrot_nonlocal_inc),
+ BTRFS_ATTR_PTR(read_policies, roundrobin_rot_nonlocal_inc),
NULL,
};
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1ad30a595722..c6dd393190f6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1265,6 +1265,11 @@ static int open_fs_devices(struct btrfs_fs_devices
*fs_devices,
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fs_devices->roundrobin_nonlocal_inc_mixed_only = true;
+ fs_devices->roundrobin_nonrot_nonlocal_inc =
+ BTRFS_DEFAULT_ROUNDROBIN_NONROT_NONLOCAL_INC;
+ fs_devices->roundrobin_rot_nonlocal_inc =
+ BTRFS_DEFAULT_ROUNDROBIN_ROT_NONLOCAL_INC;
return 0;
}
@@ -5555,8 +5560,125 @@ static u64 stripe_physical(struct map_lookup *map, u32
stripe_index,
stripe_nr * map->stripe_len;
}
+/*
+ * Calculates the load of the given mirror. Load is defines as the number of
+ * inflight requests + potential penalty value.
+ *
+ * @fs_info: the filesystem
+ * @map: mapping containing the logical extent
+ * @mirror_index: number of mirror to check
+ * @stripe_offset: offset of the block in its stripe
+ * @stripe_nr: index of the stripe whete the block falls in
+ */
+static int mirror_load(struct btrfs_fs_info *fs_info, struct map_lookup *map,
+ int mirror_index, u64 stripe_offset, u64 stripe_nr)
+{
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_device *dev;
+ int last_offset;
+ u64 physical;
+ int load;
+
+ dev = map->stripes[mirror_index].dev;
+ load = percpu_counter_sum(&dev->inflight);
+ last_offset = atomic_read(&dev->last_offset);
+ physical = stripe_physical(map, mirror_index, stripe_offset, stripe_nr);
+
+ fs_devices = fs_info->fs_devices;
+
+ /*
+ * If the filesystem has mixed type of devices (or we enable adding a
+ * penalty value regardless) and the request is non-local, add a
+ * penalty value.
+ */
+ if ((!fs_devices->roundrobin_nonlocal_inc_mixed_only ||
+ fs_devices->mixed) && last_offset != physical) {
+ if (dev->rotating)
+ return load + fs_devices->roundrobin_rot_nonlocal_inc;
+ return load + fs_devices->roundrobin_nonrot_nonlocal_inc;
+ }
+
+ return load;
+}
+
+/*
+ * Checks if the given mirror can process more requests.
+ *
+ * @fs_info: the filesystem
+ * @map: mapping containing the logical extent
+ * @mirror_index: index of the mirror to check
+ * @stripe_offset: offset of the block in its stripe
+ * @stripe_nr: index of the stripe whete the block falls in
+ *
+ * Returns true if more requests can be processes, otherwise returns false.
+ */
+static bool mirror_queue_not_filled(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int mirror_index,
+ u64 stripe_offset, u64 stripe_nr)
+{
+ struct block_device *bdev;
+ unsigned int queue_depth;
+ int inflight;
+
+ bdev = map->stripes[mirror_index].dev->bdev;
+ inflight = mirror_load(fs_info, map, mirror_index, stripe_offset,
+ stripe_nr);
+ queue_depth = blk_queue_depth(bdev->bd_disk->queue);
+
+ return inflight < queue_depth;
+}
+
+/*
+ * Find a mirror using the round-robin technique which has lower load than
+ * queue depth. Load is defined as the number of inflight requests + potential
+ * penalty value.
+ *
+ * @fs_info: the filesystem
+ * @map: mapping containing the logical extent
+ * @first: index of the first device in the stripes array
+ * @num_stripes: number of stripes in the stripes array
+ * @stripe_offset: offset of the block in its stripe
+ * @stripe_nr: index of the stripe whete the block falls in
+ *
+ * Returns the index of selected mirror.
+ */
+static int find_live_mirror_roundrobin(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int first,
+ int num_stripes, u64 stripe_offset,
+ u64 stripe_nr)
+{
+ int preferred_mirror;
+ int last_mirror;
+ int i;
+
+ last_mirror = this_cpu_read(*fs_info->last_mirror);
+
+ for (i = last_mirror; i < first + num_stripes; i++) {
+ if (mirror_queue_not_filled(fs_info, map, i, stripe_offset,
+ stripe_nr)) {
+ preferred_mirror = i;
+ goto out;
+ }
+ }
+
+ for (i = first; i < last_mirror; i++) {
+ if (mirror_queue_not_filled(fs_info, map, i, stripe_offset,
+ stripe_nr)) {
+ preferred_mirror = i;
+ goto out;
+ }
+ }
+
+ preferred_mirror = last_mirror;
+
+out:
+ this_cpu_write(*fs_info->last_mirror, preferred_mirror);
+ return preferred_mirror;
+}
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
+ u64 stripe_offset, u64 stripe_nr,
int dev_replace_is_ongoing)
{
int i;
@@ -5584,6 +5706,11 @@ static int find_live_mirror(struct btrfs_fs_info
*fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
+ case BTRFS_READ_POLICY_ROUNDROBIN:
+ preferred_mirror = find_live_mirror_roundrobin(
+ fs_info, map, first, num_stripes, stripe_offset,
+ stripe_nr);
+ break;
}
if (dev_replace_is_ongoing &&
@@ -6178,7 +6305,9 @@ static int __btrfs_map_block(struct btrfs_fs_info
*fs_info,
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
+ stripe_offset,
+ stripe_nr,
+ dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
@@ -6204,8 +6333,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
else {
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
+ stripe_index,
+ stripe_offset,
+ stripe_nr,
+ dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ee050fd48042..47ca47b60ea9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -230,9 +230,15 @@ enum btrfs_chunk_allocation_policy {
enum btrfs_read_policy {
/* Use process PID to choose the stripe */
BTRFS_READ_POLICY_PID,
+ /* Round robin */
+ BTRFS_READ_POLICY_ROUNDROBIN,
BTRFS_NR_READ_POLICY,
};
+/* Default raid1 policies config */
+#define BTRFS_DEFAULT_ROUNDROBIN_NONROT_NONLOCAL_INC 0
+#define BTRFS_DEFAULT_ROUNDROBIN_ROT_NONLOCAL_INC 1
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -294,6 +300,10 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes */
enum btrfs_read_policy read_policy;
+ /* Policies config */
+ bool roundrobin_nonlocal_inc_mixed_only;
+ u32 roundrobin_nonrot_nonlocal_inc;
+ u32 roundrobin_rot_nonlocal_inc;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64