OK, Addressed all. 1) Removed unrelated include cleanup (will send that as a follow up). 2) renamed the tail/head (and the functions that link/unlink to match) 3) we are now appending after the tail and iterating from the head 4) added a comment in iterate that explains why is that important.
Kind regards, Milos On Mon, Mar 30, 2026 at 4:56 PM Samuel Thibault <[email protected]> wrote: > > Samuel Thibault, le mar. 31 mars 2026 01:46:53 +0200, a ecrit: > > > + /* Bootstrap the loop by grabbing a ref to the very first node */ > > > > Actually, the very last. Please also explain in a comment why we > > traverse the list from the end. > > I'm actually thinking that it'd be more intuitive for the reader to add > elements at the end of the list, and process it from the start. > > Samuel
From 27021f2a7363588731bd16a776ba18b57a2c0831 Mon Sep 17 00:00:00 2001 From: Milos Nikic <[email protected]> Date: Fri, 27 Mar 2026 23:12:22 -0700 Subject: [PATCH] libdiskfs: Eliminate malloc from node-cache.c iteration. This patch changes how diskfs_node_iterate() iterates the cache content. Nodes contained in the cache now form a doubly linked list in addition to being part of the hashtable. At the cost of adding two pointers to the node structure we are able to add an additional memory efficient way of iterating the cache and thus we can now completely avoid malloc and copy during iteration. Performance improvement becomes more visible the larger the cache is, and it can even become measurable on very large node-cache size. --- libdiskfs/diskfs.h | 4 ++ libdiskfs/node-cache.c | 119 +++++++++++++++++++++++++++-------------- 2 files changed, 83 insertions(+), 40 deletions(-) diff --git a/libdiskfs/diskfs.h b/libdiskfs/diskfs.h index 9a5d5d9d..d95e1b80 100644 --- a/libdiskfs/diskfs.h +++ b/libdiskfs/diskfs.h @@ -125,6 +125,10 @@ struct node loff_t allocsize; ino64_t cache_id; + + /* The Intrusive List Pointers */ + struct node *cache_next; + struct node *cache_prev; }; struct diskfs_control diff --git a/libdiskfs/node-cache.c b/libdiskfs/node-cache.c index 5f6c3e36..79bb45ee 100644 --- a/libdiskfs/node-cache.c +++ b/libdiskfs/node-cache.c @@ -60,6 +60,43 @@ static struct hurd_ihash nodecache = HURD_IHASH_INITIALIZER_GKI (offsetof (struct node, slot), NULL, NULL, hash, compare); static pthread_rwlock_t nodecache_lock = PTHREAD_RWLOCK_INITIALIZER; +static struct node *nodecache_list_head = NULL; +static struct node *nodecache_list_tail = NULL; + +/* Unlinks a node from the active nodes doubly-linked list. + The caller MUST hold nodecache_lock (for writing) before calling this. */ +static void +unlink_list_node (struct node *np) +{ + if (np->cache_prev) + np->cache_prev->cache_next = np->cache_next; + if (np->cache_next) + np->cache_next->cache_prev = np->cache_prev; + + if (nodecache_list_head == np) + nodecache_list_head = np->cache_next; + if (nodecache_list_tail == np) + nodecache_list_tail = np->cache_prev; + + np->cache_prev = NULL; + np->cache_next = NULL; +} + +/* Adds a node to the tail of the active nodes doubly-linked list. + The caller MUST hold nodecache_lock (for writing) before calling this. */ +static void +link_list_node (struct node *np) +{ + np->cache_next = NULL; + np->cache_prev = nodecache_list_tail; + + if (nodecache_list_tail) + nodecache_list_tail->cache_next = np; + else + nodecache_list_head = np; + + nodecache_list_tail = np; +} /* Fetch inode INUM, set *NPP to the node structure; gain one user reference and lock the node. */ @@ -107,6 +144,7 @@ diskfs_cached_lookup_context (ino_t inum, struct node **npp, err = hurd_ihash_locp_add (&nodecache, slot, (hurd_ihash_key_t) &np->cache_id, np); assert_perror_backtrace (err); + link_list_node (np); diskfs_nref_light (np); pthread_rwlock_unlock (&nodecache_lock); @@ -116,6 +154,7 @@ diskfs_cached_lookup_context (ino_t inum, struct node **npp, { pthread_rwlock_wrlock (&nodecache_lock); hurd_ihash_remove (&nodecache, (hurd_ihash_key_t) &np->cache_id); + unlink_list_node (np); pthread_rwlock_unlock (&nodecache_lock); /* Don't delete from disk. */ @@ -180,6 +219,7 @@ diskfs_try_dropping_softrefs (struct node *np) hurd_ihash_locp_remove (&nodecache, np->slot); np->slot = NULL; + unlink_list_node (np); /* Flush node if needed, before forgetting it */ diskfs_node_update (np, diskfs_synchronous); @@ -192,59 +232,58 @@ diskfs_try_dropping_softrefs (struct node *np) /* For each active node, call FUN. The node is to be locked around the call to FUN. If FUN returns non-zero for any node, then immediately stop, and - return that value. */ + return that value. + + We iterate the list forwards (from head to tail). Since new nodes + are appended to the tail, this means we process the oldest nodes first + (FIFO order). This preserves the chronological order of file creation + and modification, which allows the block layer and disk scheduler to + coalesce I/O operations and perform sequential disk writes efficiently. + Iterating backwards (LIFO) would cause severe disk thrashing. */ error_t __attribute__ ((weak)) diskfs_node_iterate (error_t (*fun)(struct node *)) { error_t err = 0; - size_t num_nodes; - struct node *node, **node_list, **p; + struct node *current, *next_node; pthread_rwlock_rdlock (&nodecache_lock); + current = nodecache_list_head; - /* We must copy everything from the hash table into another data structure - to avoid running into any problems with the hash-table being modified - during processing (normally we delegate access to hash-table with - nodecache_lock, but we can't hold this while locking the - individual node locks). */ - /* XXX: Can we? */ - num_nodes = nodecache.nr_items; - - /* TODO This method doesn't scale beyond a few dozen nodes and should be - replaced. */ - node_list = malloc (num_nodes * sizeof (struct node *)); - if (node_list == NULL) - { - pthread_rwlock_unlock (&nodecache_lock); - return ENOMEM; - } + /* Bootstrap the loop by grabbing a ref to the very first node */ + if (current) + refcounts_ref (¤t->refcounts, NULL); - p = node_list; - HURD_IHASH_ITERATE (&nodecache, i) + while (current != NULL) { - *p++ = node = i; + /* Grab a next pointer so it doesn't disappear while we are processing + * 'current' */ + next_node = current->cache_next; + if (next_node) + refcounts_ref (&next_node->refcounts, NULL); - /* We acquire a hard reference for node, but without using - diskfs_nref. We do this so that diskfs_new_hardrefs will not - get called. */ - refcounts_ref (&node->refcounts, NULL); - } - pthread_rwlock_unlock (&nodecache_lock); + pthread_rwlock_unlock (&nodecache_lock); - p = node_list; - while (num_nodes-- > 0) - { - node = *p++; - if (!err) - { - pthread_mutex_lock (&node->lock); - err = (*fun)(node); - pthread_mutex_unlock (&node->lock); - } - diskfs_nrele (node); + pthread_mutex_lock (¤t->lock); + err = (*fun)(current); + pthread_mutex_unlock (¤t->lock); + + /* We are done with 'current', drop the ref we grabbed */ + diskfs_nrele (current); + if (err) + { + /* We don't need to traverse the rest of the list! + Just drop the next_node ref if we grabbed it, and return. */ + if (next_node) + diskfs_nrele (next_node); + return err; + } + + /* Re-acquire the global lock to loop around */ + pthread_rwlock_rdlock (&nodecache_lock); + current = next_node; } - free (node_list); + pthread_rwlock_unlock (&nodecache_lock); return err; } -- 2.53.0
