I forgot to add to the subject prefix that this patch is targeted at the stable-bookworm branch — sorry for the noise.
On Mon, 28 Jul 2025 18:30:41 +0200 Stoiko Ivanov <s.iva...@proxmox.com> wrote: > as requested and argued in: > https://lore.proxmox.com/pve-devel/5f3e46ed-bf99-45e2-b497-fc81dc50d...@proxmox.com/ > > Signed-off-by: Stoiko Ivanov <s.iva...@proxmox.com> > --- > If accepted we'd try to include the backport upstream for 2.2.9 > ...kport-enforce-arc_dnode_limit-to-2.2.patch | 207 ++++++++++++++++++ > debian/patches/series | 1 + > 2 files changed, 208 insertions(+) > create mode 100644 > debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch > > diff --git > a/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch > b/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch > new file mode 100644 > index 000000000..26c0dface > --- /dev/null > +++ b/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch > @@ -0,0 +1,207 @@ > +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 > +From: Stoiko Ivanov <s.iva...@proxmox.com> > +Date: Mon, 28 Jul 2025 15:16:46 +0200 > +Subject: [PATCH] backport enforce arc_dnode_limit to 2.2 > + > +This patch is a backport of a7a144e65 ("enforce arc_dnode_limit") > +for the 2.2 branch. 
> + > +back-ported from commit a7a144e655850b4160943e4ba315eb9a5dc2b2fe > +working around changes from: > +55427add3 ("Several improvements to ARC shrinking (#16197)") > +5b9f3b766 ("Soften pruning threshold on not evictable metadata") > +which are present in 2.2.3, but not in 2.2.8 > + > +Signed-off-by: Stoiko Ivanov <s.iva...@proxmox.com> > +--- > + include/sys/arc_impl.h | 2 +- > + module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++ > + module/zfs/arc.c | 27 ++++++++----- > + 3 files changed, 83 insertions(+), 11 deletions(-) > + > +diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h > +index > defebe3b2fbbdc8b1c901108f19bde8f12ea2175..36cd83e83358e123980909a903854d573531d4b6 > 100644 > +--- a/include/sys/arc_impl.h > ++++ b/include/sys/arc_impl.h > +@@ -952,7 +952,7 @@ typedef struct arc_sums { > + wmsum_t arcstat_data_size; > + wmsum_t arcstat_metadata_size; > + wmsum_t arcstat_dbuf_size; > +- wmsum_t arcstat_dnode_size; > ++ aggsum_t arcstat_dnode_size; > + wmsum_t arcstat_bonus_size; > + wmsum_t arcstat_l2_hits; > + wmsum_t arcstat_l2_misses; > +diff --git a/module/os/linux/zfs/zfs_vfsops.c > b/module/os/linux/zfs/zfs_vfsops.c > +index > 1f72cce07dd1830e2f5fdff50ef298e05be3013d..da0cda03985e93acfa111efb7d6e9d6637f729cf > 100644 > +--- a/module/os/linux/zfs/zfs_vfsops.c > ++++ b/module/os/linux/zfs/zfs_vfsops.c > +@@ -1179,6 +1179,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) > + return (error); > + } > + > ++/* > ++ * Dentry and inode caches referenced by a task in non-root memcg are > ++ * not going to be scanned by the kernel-provided shrinker. So, if > ++ * kernel prunes nothing, fall back to this manual walk to free dnodes. > ++ * To avoid scanning the same znodes multiple times they are always rotated > ++ * to the end of the z_all_znodes list. New znodes are inserted at the > ++ * end of the list so we're always scanning the oldest znodes first. 
> ++ */ > ++static int > ++zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) > ++{ > ++ znode_t **zp_array, *zp; > ++ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); > ++ int objects = 0; > ++ int i = 0, j = 0; > ++ > ++ zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); > ++ > ++ mutex_enter(&zfsvfs->z_znodes_lock); > ++ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { > ++ > ++ if ((i++ > nr_to_scan) || (j >= max_array)) > ++ break; > ++ > ++ ASSERT(list_link_active(&zp->z_link_node)); > ++ list_remove(&zfsvfs->z_all_znodes, zp); > ++ list_insert_tail(&zfsvfs->z_all_znodes, zp); > ++ > ++ /* Skip active znodes and .zfs entries */ > ++ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) > ++ continue; > ++ > ++ if (igrab(ZTOI(zp)) == NULL) > ++ continue; > ++ > ++ zp_array[j] = zp; > ++ j++; > ++ } > ++ mutex_exit(&zfsvfs->z_znodes_lock); > ++ > ++ for (i = 0; i < j; i++) { > ++ zp = zp_array[i]; > ++ > ++ ASSERT3P(zp, !=, NULL); > ++ d_prune_aliases(ZTOI(zp)); > ++ > ++ if (atomic_read(&ZTOI(zp)->i_count) == 1) > ++ objects++; > ++ > ++ zrele(zp); > ++ } > ++ > ++ vmem_free(zp_array, max_array * sizeof (znode_t *)); > ++ > ++ return (objects); > ++} > ++ > + /* > + * The ARC has requested that the filesystem drop entries from the dentry > + * and inode caches. This can occur when the ARC needs to free meta data > +@@ -1222,6 +1279,14 @@ zfs_prune(struct super_block *sb, unsigned long > nr_to_scan, int *objects) > + *objects = (*shrinker->scan_objects)(shrinker, &sc); > + #endif > + > ++ /* > ++ * Fall back to zfs_prune_aliases if kernel's shrinker did nothing > ++ * due to dentry and inode caches being referenced by a task running > ++ * in non-root memcg. 
> ++ */ > ++ if (*objects == 0) > ++ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); > ++ > + zfs_exit(zfsvfs, FTAG); > + > + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, > +diff --git a/module/zfs/arc.c b/module/zfs/arc.c > +index > 5c6e92f0f8b31dbcd569c92e645afb2e180b2deb..383aca2808d2c0aa8d09a9cdc8cfbfde4f6a6fc9 > 100644 > +--- a/module/zfs/arc.c > ++++ b/module/zfs/arc.c > +@@ -2597,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t > type) > + ARCSTAT_INCR(arcstat_bonus_size, space); > + break; > + case ARC_SPACE_DNODE: > +- ARCSTAT_INCR(arcstat_dnode_size, space); > ++ aggsum_add(&arc_sums.arcstat_dnode_size, space); > + break; > + case ARC_SPACE_DBUF: > + ARCSTAT_INCR(arcstat_dbuf_size, space); > +@@ -2643,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) > + ARCSTAT_INCR(arcstat_bonus_size, -space); > + break; > + case ARC_SPACE_DNODE: > +- ARCSTAT_INCR(arcstat_dnode_size, -space); > ++ aggsum_add(&arc_sums.arcstat_dnode_size, -space); > + break; > + case ARC_SPACE_DBUF: > + ARCSTAT_INCR(arcstat_dbuf_size, -space); > +@@ -4292,7 +4292,7 @@ arc_evict(void) > + * target is not evictable or if they go over arc_dnode_limit. > + */ > + int64_t prune = 0; > +- int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); > ++ int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); > + w = wt * (int64_t)(arc_meta >> 16) >> 16; > + if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + > + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - > +@@ -4775,12 +4775,19 @@ arc_is_overflowing(boolean_t use_reserve) > + * in the ARC. In practice, that's in the tens of MB, which is low > + * enough to be safe. > + */ > +- int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - > ++ int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - > + arc_c - overflow / 2; > + if (!use_reserve) > + overflow /= 2; > +- return (over < 0 ? ARC_OVF_NONE : > +- over < overflow ? 
ARC_OVF_SOME : ARC_OVF_SEVERE); > ++ > ++ int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - > ++ arc_dnode_limit; > ++ > ++ /* Always allow at least one block of overflow. */ > ++ if (arc_over < 0 && dn_over <= 0) > ++ return (ARC_OVF_NONE); > ++ > ++ return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); > + } > + > + static abd_t * > +@@ -6938,7 +6945,7 @@ arc_kstat_update(kstat_t *ksp, int rw) > + #if defined(COMPAT_FREEBSD11) > + as->arcstat_other_size.value.ui64 = > + wmsum_value(&arc_sums.arcstat_bonus_size) + > +- wmsum_value(&arc_sums.arcstat_dnode_size) + > ++ aggsum_value(&arc_sums.arcstat_dnode_size) + > + wmsum_value(&arc_sums.arcstat_dbuf_size); > + #endif > + > +@@ -6980,7 +6987,7 @@ arc_kstat_update(kstat_t *ksp, int rw) > + &as->arcstat_uncached_evictable_metadata); > + > + as->arcstat_dnode_size.value.ui64 = > +- wmsum_value(&arc_sums.arcstat_dnode_size); > ++ aggsum_value(&arc_sums.arcstat_dnode_size); > + as->arcstat_bonus_size.value.ui64 = > + wmsum_value(&arc_sums.arcstat_bonus_size); > + as->arcstat_l2_hits.value.ui64 = > +@@ -7349,7 +7356,7 @@ arc_state_init(void) > + wmsum_init(&arc_sums.arcstat_data_size, 0); > + wmsum_init(&arc_sums.arcstat_metadata_size, 0); > + wmsum_init(&arc_sums.arcstat_dbuf_size, 0); > +- wmsum_init(&arc_sums.arcstat_dnode_size, 0); > ++ aggsum_init(&arc_sums.arcstat_dnode_size, 0); > + wmsum_init(&arc_sums.arcstat_bonus_size, 0); > + wmsum_init(&arc_sums.arcstat_l2_hits, 0); > + wmsum_init(&arc_sums.arcstat_l2_misses, 0); > +@@ -7507,7 +7514,7 @@ arc_state_fini(void) > + wmsum_fini(&arc_sums.arcstat_data_size); > + wmsum_fini(&arc_sums.arcstat_metadata_size); > + wmsum_fini(&arc_sums.arcstat_dbuf_size); > +- wmsum_fini(&arc_sums.arcstat_dnode_size); > ++ aggsum_fini(&arc_sums.arcstat_dnode_size); > + wmsum_fini(&arc_sums.arcstat_bonus_size); > + wmsum_fini(&arc_sums.arcstat_l2_hits); > + wmsum_fini(&arc_sums.arcstat_l2_misses); > diff --git a/debian/patches/series b/debian/patches/series > 
index 229027ff9..11a97debd 100644 > --- a/debian/patches/series > +++ b/debian/patches/series > @@ -9,3 +9,4 @@ > 0009-arc-stat-summary-guard-access-to-freshly-introduced-.patch > 0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch > 0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch > +0012-backport-enforce-arc_dnode_limit-to-2.2.patch _______________________________________________ pve-devel mailing list pve-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel