OK, accepted. Mikulas
On Tue, 22 Apr 2025, Eric Biggers wrote: > From: Eric Biggers <ebigg...@google.com> > > Every 30 seconds, dm-bufio evicts all buffers that were not accessed > within the last max_age_seconds, except those pinned in memory via > retain_bytes. By default max_age_seconds is 300 (i.e. 5 minutes), and > retain_bytes is 262144 (i.e. 256 KiB) per dm-bufio client. > > This eviction algorithm is much too eager and is also redundant with the > shinker based eviction. > > Testing on an Android phone shows that about 30 MB of dm-bufio buffers > (from dm-verity Merkle tree blocks) are loaded at boot time, and then > about 90% of them are suddenly thrown away 5 minutes after boot. This > results in unnecessary Merkle tree I/O later. > > Meanwhile, if the system actually encounters memory pressure, testing > also shows that the shrinker is effective at evicting the buffers. > > Other major Linux kernel caches, such as the page cache, do not enforce > a maximum age, instead relying on the shrinker. > > For these reasons, Android is now setting max_age_seconds to 86400 > (i.e. 1 day), which mostly disables it; see > https://android.googlesource.com/platform/system/core/+/cadad290a79d5b0a30add935aaadab7c1b1ef5e9%5E%21/ > > That is a much better default, but really the maximum age based eviction > should not exist at all. Let's remove it. > > Note that this also eliminates the need to run work every 30 seconds, > which is beneficial too. > > Signed-off-by: Eric Biggers <ebigg...@google.com> > --- > drivers/md/dm-bufio.c | 189 ++++++++---------------------------------- > 1 file changed, 36 insertions(+), 153 deletions(-) > > diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c > index 9c8ed65cd87e6..8f5ac14bb3b74 100644 > --- a/drivers/md/dm-bufio.c > +++ b/drivers/md/dm-bufio.c > @@ -38,20 +38,10 @@ > #define DM_BUFIO_MEMORY_PERCENT 2 > #define DM_BUFIO_VMALLOC_PERCENT 25 > #define DM_BUFIO_WRITEBACK_RATIO 3 > #define DM_BUFIO_LOW_WATERMARK_RATIO 16 > > -/* > - * Check buffer ages in this interval (seconds) > - */ > -#define DM_BUFIO_WORK_TIMER_SECS 30 > - > -/* > - * Free buffers when they are older than this (seconds) > - */ > -#define DM_BUFIO_DEFAULT_AGE_SECS 300 > - > /* > * The nr of bytes of cached data to keep around. > */ > #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) > > @@ -1053,14 +1043,12 @@ static unsigned long dm_bufio_cache_size; > */ > static unsigned long dm_bufio_cache_size_latch; > > static DEFINE_SPINLOCK(global_spinlock); > > -/* > - * Buffers are freed after this timeout > - */ > -static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; > +static unsigned int dm_bufio_max_age; /* No longer does anything */ > + > static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; > > static unsigned long dm_bufio_peak_allocated; > static unsigned long dm_bufio_allocated_kmem_cache; > static unsigned long dm_bufio_allocated_kmalloc; > @@ -1084,11 +1072,10 @@ static LIST_HEAD(dm_bufio_all_clients); > * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count > */ > static DEFINE_MUTEX(dm_bufio_clients_lock); > > static struct workqueue_struct *dm_bufio_wq; > -static struct delayed_work dm_bufio_cleanup_old_work; > static struct work_struct dm_bufio_replacement_work; > > > #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING > static void buffer_record_stack(struct dm_buffer *b) > @@ -2671,134 +2658,10 @@ void dm_bufio_set_sector_offset(struct > dm_bufio_client *c, sector_t start) > } > EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); > > /*--------------------------------------------------------------*/ > > -static unsigned int get_max_age_hz(void) > -{ > - unsigned int max_age = READ_ONCE(dm_bufio_max_age); > - > - if (max_age > UINT_MAX / HZ) > - max_age = UINT_MAX / HZ; > - > - return max_age * HZ; > -} > - > -static bool older_than(struct dm_buffer *b, unsigned long age_hz) > -{ > - return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); > -} > - > -struct evict_params { > - gfp_t gfp; > - unsigned long age_hz; > - > - /* > - * This gets updated with the largest last_accessed (ie. most > - * recently used) of the evicted buffers. It will not be reinitialised > - * by __evict_many(), so you can use it across multiple invocations. > - */ > - unsigned long last_accessed; > -}; > - > -/* > - * We may not be able to evict this buffer if IO pending or the client > - * is still using it. > - * > - * And if GFP_NOFS is used, we must not do any I/O because we hold > - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets > - * rerouted to different bufio client. > - */ > -static enum evict_result select_for_evict(struct dm_buffer *b, void *context) > -{ > - struct evict_params *params = context; > - > - if (!(params->gfp & __GFP_FS) || > - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { > - if (test_bit_acquire(B_READING, &b->state) || > - test_bit(B_WRITING, &b->state) || > - test_bit(B_DIRTY, &b->state)) > - return ER_DONT_EVICT; > - } > - > - return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP; > -} > - > -static unsigned long __evict_many(struct dm_bufio_client *c, > - struct evict_params *params, > - int list_mode, unsigned long max_count) > -{ > - unsigned long count; > - unsigned long last_accessed; > - struct dm_buffer *b; > - > - for (count = 0; count < max_count; count++) { > - b = cache_evict(&c->cache, list_mode, select_for_evict, params); > - if (!b) > - break; > - > - last_accessed = READ_ONCE(b->last_accessed); > - if (time_after_eq(params->last_accessed, last_accessed)) > - params->last_accessed = last_accessed; > - > - __make_buffer_clean(b); > - __free_buffer_wake(b); > - > - cond_resched(); > - } > - > - return count; > -} > - > -static void evict_old_buffers(struct dm_bufio_client *c, unsigned long > age_hz) > -{ > - struct evict_params params = {.gfp = 0, .age_hz = age_hz, > .last_accessed = 0}; > - unsigned long retain = get_retain_buffers(c); > - unsigned long count; > - LIST_HEAD(write_list); > - > - dm_bufio_lock(c); > - > - __check_watermark(c, &write_list); > - if (unlikely(!list_empty(&write_list))) { > - dm_bufio_unlock(c); > - __flush_write_list(&write_list); > - dm_bufio_lock(c); > - } > - > - count = cache_total(&c->cache); > - if (count > retain) > - __evict_many(c, ¶ms, LIST_CLEAN, count - retain); > - > - dm_bufio_unlock(c); > -} > - > -static void cleanup_old_buffers(void) > -{ > - unsigned long max_age_hz = get_max_age_hz(); > - struct dm_bufio_client *c; > - > - mutex_lock(&dm_bufio_clients_lock); > - > - __cache_size_refresh(); > - > - list_for_each_entry(c, &dm_bufio_all_clients, client_list) > - evict_old_buffers(c, max_age_hz); > - > - mutex_unlock(&dm_bufio_clients_lock); > -} > - > -static void work_fn(struct work_struct *w) > -{ > - cleanup_old_buffers(); > - > - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, > - DM_BUFIO_WORK_TIMER_SECS * HZ); > -} > - > -/*--------------------------------------------------------------*/ > - > /* > * Global cleanup tries to evict the oldest buffers from across _all_ > * the clients. It does this by repeatedly evicting a few buffers from > * the client that holds the oldest buffer. It's approximate, but hopefully > * good enough. > @@ -2832,31 +2695,55 @@ static void __insert_client(struct dm_bufio_client > *new_client) > } > > list_add_tail(&new_client->client_list, h); > } > > +static enum evict_result select_for_evict(struct dm_buffer *b, void *context) > +{ > + /* In no-sleep mode, we cannot wait on IO. */ > + if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) { > + if (test_bit_acquire(B_READING, &b->state) || > + test_bit(B_WRITING, &b->state) || > + test_bit(B_DIRTY, &b->state)) > + return ER_DONT_EVICT; > + } > + return ER_EVICT; > +} > + > static unsigned long __evict_a_few(unsigned long nr_buffers) > { > - unsigned long count; > struct dm_bufio_client *c; > - struct evict_params params = { > - .gfp = GFP_KERNEL, > - .age_hz = 0, > - /* set to jiffies in case there are no buffers in this client */ > - .last_accessed = jiffies > - }; > + unsigned long oldest_buffer = jiffies; > + unsigned long last_accessed; > + unsigned long count; > + struct dm_buffer *b; > > c = __pop_client(); > if (!c) > return 0; > > dm_bufio_lock(c); > - count = __evict_many(c, ¶ms, LIST_CLEAN, nr_buffers); > + > + for (count = 0; count < nr_buffers; count++) { > + b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL); > + if (!b) > + break; > + > + last_accessed = READ_ONCE(b->last_accessed); > + if (time_after_eq(oldest_buffer, last_accessed)) > + oldest_buffer = last_accessed; > + > + __make_buffer_clean(b); > + __free_buffer_wake(b); > + > + cond_resched(); > + } > + > dm_bufio_unlock(c); > > if (count) > - c->oldest_buffer = params.last_accessed; > + c->oldest_buffer = oldest_buffer; > __insert_client(c); > > return count; > } > > @@ -2935,14 +2822,11 @@ static int __init dm_bufio_init(void) > > dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0); > if (!dm_bufio_wq) > return -ENOMEM; > > - INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); > INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); > - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, > - DM_BUFIO_WORK_TIMER_SECS * HZ); > > return 0; > } > > /* > @@ -2950,11 +2834,10 @@ static int __init dm_bufio_init(void) > */ > static void __exit dm_bufio_exit(void) > { > int bug = 0; > > - cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); > destroy_workqueue(dm_bufio_wq); > > if (dm_bufio_client_count) { > DMCRIT("%s: dm_bufio_client_count leaked: %d", > __func__, dm_bufio_client_count); > @@ -2987,11 +2870,11 @@ module_exit(dm_bufio_exit) > > module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644); > MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); > > module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644); > -MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); > +MODULE_PARM_DESC(max_age_seconds, "No longer does anything"); > > module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644); > MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached > in memory"); > > module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, > 0644); > > base-commit: a33b5a08cbbdd7aadff95f40cbb45ab86841679e > -- > 2.49.0 >