Add logic to free up a busy memory range. Freed memory range will be
returned to free pool. Add a worker which can be started to select
and free some busy memory ranges.
In certain cases (write path), process can steal one of its busy
dax ranges if free range is not available.
If free range is not available and nothing can't be stolen from same
inode, caller waits on a waitq for free range to become available.
For reclaiming a range, as of now we need to hold following locks in
specified order.
inode_trylock(inode);
down_write(>i_mmap_sem);
down_write(>i_dmap_sem);
This means, one can not wait for a range to become free when in fault
path because it can lead to deadlock in following two situations.
- Worker thread to free memory might block on fuse_inode->i_mmap_sem as well.
- This inode is holding all the memory and more memory can't be freed.
In both the cases, deadlock will ensue. So return -ENOSPC from iomap_begin()
in fault path if memory can't be allocated. Drop fuse_inode->i_mmap_sem,
and wait for a free range to become available and retry.
read path can't do direct reclaim as well because it holds shared inode
lock while reclaim assumes that inode lock is held exclusively. Due to
shared lock, it might happen that one reader is still reading from range
and another reader reclaims that range leading to problems. So read path
also returns -ENOSPC and higher layers retry (like fault path).
a different story. We hold inode lock and lock ordering
allows to grab fuse_inode->immap_sem, if needed. That means we can do direct
reclaim in that path. But if there is no memory allocated to this inode,
then direct reclaim will not work and we need to wait for a memory range
to become free. So try following order.
A. Try to get a free range.
B. If not, try direct reclaim.
C. If not, wait for a memory range to become free
Here sleeping with locks held should be fine because in step B, we made
sure this inode is not holding any ranges. That means other inodes are
holding ranges and somebody should be able to free memory. Also, worker
thread does a trylock() on inode lock. That means worker tread will not
wait on this inode and move onto next memory range. Hence above sequence
should be deadlock free.
Signed-off-by: Vivek Goyal
---
fs/fuse/file.c | 357 +--
fs/fuse/fuse_i.h | 22 +++
fs/fuse/inode.c | 4 +
3 files changed, 374 insertions(+), 9 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 3f0f7a387341..87fc2b5e0a3a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -25,6 +25,8 @@
INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
START, LAST, static inline, fuse_dax_interval_tree);
+static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc,
+ struct inode *inode);
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
@@ -179,6 +181,7 @@ static void fuse_link_write_file(struct file *file)
static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
{
+ unsigned long free_threshold;
struct fuse_dax_mapping *dmap = NULL;
spin_lock(>lock);
@@ -186,7 +189,7 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct
fuse_conn *fc)
/* TODO: Add logic to try to free up memory if wait is allowed */
if (fc->nr_free_ranges <= 0) {
spin_unlock(>lock);
- return NULL;
+ goto out_kick;
}
WARN_ON(list_empty(>free_ranges));
@@ -197,15 +200,43 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct
fuse_conn *fc)
list_del_init(>list);
fc->nr_free_ranges--;
spin_unlock(>lock);
+
+out_kick:
+ /* If number of free ranges are below threshold, start reclaim */
+ free_threshold = max((fc->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD)/100,
+ (unsigned long)1);
+ if (fc->nr_free_ranges < free_threshold) {
+ pr_debug("fuse: Kicking dax memory reclaim worker.
nr_free_ranges=0x%ld nr_total_ranges=%ld\n", fc->nr_free_ranges, fc->nr_ranges);
+ queue_delayed_work(system_long_wq, >dax_free_work, 0);
+ }
return dmap;
}
+/* This assumes fc->lock is held */
+static void __dmap_remove_busy_list(struct fuse_conn *fc,
+ struct fuse_dax_mapping *dmap)
+{
+ list_del_init(>busy_list);
+ WARN_ON(fc->nr_busy_ranges == 0);
+ fc->nr_busy_ranges--;
+}
+
+static void dmap_remove_busy_list(struct fuse_conn *fc,
+ struct fuse_dax_mapping *dmap)
+{
+ spin_lock(>lock);
+ __dmap_remove_busy_list(fc, dmap);
+ spin_unlock(>lock);
+}
+
/* This assumes fc->lock is held */
static void __free_dax_mapping(struct fuse_conn *fc,