Author: smh
Date: Fri Oct 10 00:12:16 2014
New Revision: 272875
URL: https://svnweb.freebsd.org/changeset/base/272875

Log:
  MFC r270759:
  Refactor ZFS ARC reclaim logic to be more VM cooperative
  
  MFC r270861:
  Ensure that ZFS ARC free memory checks include cached pages
  
  MFC r272483:
  Refactor ZFS ARC reclaim checks and limits
  
  Sponsored by: Multiplay

Modified:
  stable/10/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
  stable/10/sys/cddl/compat/opensolaris/sys/kmem.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/10/sys/vm/vm_pageout.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
==============================================================================
--- stable/10/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c       Thu Oct  9 23:52:33 2014        (r272874)
+++ stable/10/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c       Fri Oct 10 00:12:16 2014        (r272875)
@@ -133,13 +133,6 @@ kmem_size(void)
        return (kmem_size_val);
 }
 
-uint64_t
-kmem_used(void)
-{
-
-       return (vmem_size(kmem_arena, VMEM_ALLOC));
-}
-
 static int
 kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
 {

Modified: stable/10/sys/cddl/compat/opensolaris/sys/kmem.h
==============================================================================
--- stable/10/sys/cddl/compat/opensolaris/sys/kmem.h    Thu Oct  9 23:52:33 2014        (r272874)
+++ stable/10/sys/cddl/compat/opensolaris/sys/kmem.h    Fri Oct 10 00:12:16 2014        (r272875)
@@ -66,7 +66,6 @@ typedef struct kmem_cache {
 void *zfs_kmem_alloc(size_t size, int kmflags);
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
-uint64_t kmem_used(void);
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
@@ -78,6 +77,9 @@ void kmem_reap(void);
 int kmem_debugging(void);
 void *calloc(size_t n, size_t s);
 
+#define        freemem                         (cnt.v_free_count + cnt.v_cache_count)
+#define        minfree                         cnt.v_free_min
+#define        heap_arena                      kmem_arena
 #define        kmem_alloc(size, kmflags)       zfs_kmem_alloc((size), (kmflags))
 #define        kmem_zalloc(size, kmflags)      zfs_kmem_alloc((size), (kmflags) | M_ZERO)
 #define        kmem_free(buf, size)            zfs_kmem_free((buf), (size))

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Thu Oct  9 23:52:33 2014        (r272874)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Fri Oct 10 00:12:16 2014        (r272875)
@@ -138,6 +138,7 @@
 #include <sys/sdt.h>
 
 #include <vm/vm_pageout.h>
+#include <machine/vmparam.h>
 
 #ifdef illumos
 #ifndef _KERNEL
@@ -193,9 +194,6 @@ extern int zfs_prefetch_disable;
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
@@ -204,6 +202,19 @@ int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = 0;
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+
+#ifdef _KERNEL
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+       zfs_arc_free_target = vm_pageout_wakeup_thresh;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
 
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
@@ -217,6 +228,36 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
     &zfs_arc_average_blocksize, 0,
     "ARC average blocksize");
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+       u_int val;
+       int err;
+
+       val = zfs_arc_free_target;
+       err = sysctl_handle_int(oidp, &val, 0, req);
+       if (err != 0 || req->newptr == NULL)
+               return (err);
+
+       if (val < minfree)
+               return (EINVAL);
+       if (val > cnt.v_page_count)
+               return (EINVAL);
+
+       zfs_arc_free_target = val;
+
+       return (0);
+}
+#endif
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -2421,9 +2462,12 @@ arc_flush(spa_t *spa)
 void
 arc_shrink(void)
 {
+
        if (arc_c > arc_c_min) {
                uint64_t to_free;
 
+               DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+                       arc_c_min, uint64_t, arc_p, uint64_t, to_free);
 #ifdef _KERNEL
                to_free = arc_c >> arc_shrink_shift;
 #else
@@ -2439,12 +2483,19 @@ arc_shrink(void)
                        arc_c = MAX(arc_size, arc_c_min);
                if (arc_p > arc_c)
                        arc_p = (arc_c >> 1);
+
+               DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+                       arc_p);
+
                ASSERT(arc_c >= arc_c_min);
                ASSERT((int64_t)arc_p >= 0);
        }
 
-       if (arc_size > arc_c)
+       if (arc_size > arc_c) {
+               DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+                       uint64_t, arc_c);
                arc_adjust();
+       }
 }
 
 static int needfree = 0;
@@ -2455,15 +2506,20 @@ arc_reclaim_needed(void)
 
 #ifdef _KERNEL
 
-       if (needfree)
+       if (needfree) {
+               DTRACE_PROBE(arc__reclaim_needfree);
                return (1);
+       }
 
        /*
         * Cooperate with pagedaemon when it's time for it to scan
         * and reclaim some pages.
         */
-       if (vm_paging_needed())
+       if (freemem < zfs_arc_free_target) {
+               DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
+                   freemem, uint64_t, zfs_arc_free_target);
                return (1);
+       }
 
 #ifdef sun
        /*
@@ -2491,7 +2547,18 @@ arc_reclaim_needed(void)
        if (availrmem < swapfs_minfree + swapfs_reserve + extra)
                return (1);
 
-#if defined(__i386)
+       /*
+        * Check that we have enough availrmem that memory locking (e.g., via
+        * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+        * stores the number of pages that cannot be locked; when availrmem
+        * drops below pages_pp_maximum, page locking mechanisms such as
+        * page_pp_lock() will fail.)
+        */
+       if (availrmem <= pages_pp_maximum)
+               return (1);
+
+#endif /* sun */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
        /*
         * If we're on an i386 platform, it's possible that we'll exhaust the
         * kernel heap space before we ever run out of available physical
@@ -2503,32 +2570,49 @@ arc_reclaim_needed(void)
         * heap is allocated.  (Or, in the calculation, if less than 1/4th is
         * free)
         */
-       if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-           (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+       if (vmem_size(heap_arena, VMEM_FREE) <
+           (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
+               DTRACE_PROBE2(arc__reclaim_used, uint64_t,
+                   vmem_size(heap_arena, VMEM_FREE), uint64_t,
+                   (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
                return (1);
+       }
 #endif
-#else  /* !sun */
-       if (kmem_used() > (kmem_size() * 3) / 4)
+#ifdef sun
+       /*
+        * If zio data pages are being allocated out of a separate heap segment,
+        * then enforce that the size of available vmem for this arena remains
+        * above about 1/16th free.
+        *
+        * Note: The 1/16th arena free requirement was put in place
+        * to aggressively evict memory from the arc in order to avoid
+        * memory fragmentation issues.
+        */
+       if (zio_arena != NULL &&
+           vmem_size(zio_arena, VMEM_FREE) <
+           (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
                return (1);
 #endif /* sun */
-
-#else
+#else  /* _KERNEL */
        if (spa_get_random(100) == 0)
                return (1);
-#endif
+#endif /* _KERNEL */
+       DTRACE_PROBE(arc__reclaim_no);
+
        return (0);
 }
 
 extern kmem_cache_t    *zio_buf_cache[];
 extern kmem_cache_t    *zio_data_buf_cache[];
 
-static void
+static void __noinline
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
        size_t                  i;
        kmem_cache_t            *prev_cache = NULL;
        kmem_cache_t            *prev_data_cache = NULL;
 
+       DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
        if (arc_meta_used >= arc_meta_limit) {
                /*
@@ -2564,6 +2648,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
        }
        kmem_cache_reap_now(buf_cache);
        kmem_cache_reap_now(hdr_cache);
+
+#ifdef sun
+       /*
+        * Ask the vmem arena to reclaim unused memory from its
+        * quantum caches.
+        */
+       if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+               vmem_qcache_reap(zio_arena);
+#endif
+       DTRACE_PROBE(arc__kmem_reap_end);
 }
 
 static void
@@ -2581,6 +2675,7 @@ arc_reclaim_thread(void *dummy __unused)
 
                        if (arc_no_grow) {
                                if (last_reclaim == ARC_RECLAIM_CONS) {
+                                       DTRACE_PROBE(arc__reclaim_aggr_no_grow);
                                        last_reclaim = ARC_RECLAIM_AGGR;
                                } else {
                                        last_reclaim = ARC_RECLAIM_CONS;
@@ -2588,6 +2683,7 @@ arc_reclaim_thread(void *dummy __unused)
                        } else {
                                arc_no_grow = TRUE;
                                last_reclaim = ARC_RECLAIM_AGGR;
+                               DTRACE_PROBE(arc__reclaim_aggr);
                                membar_producer();
                        }
 
@@ -2692,6 +2788,7 @@ arc_adapt(int bytes, arc_state_t *state)
         * cache size, increment the target cache size
         */
        if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+               DTRACE_PROBE1(arc__inc_adapt, int, bytes);
                atomic_add_64(&arc_c, (int64_t)bytes);
                if (arc_c > arc_c_max)
                        arc_c = arc_c_max;
@@ -2713,20 +2810,6 @@ arc_evict_needed(arc_buf_contents_t type
        if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
                return (1);
 
-#ifdef sun
-#ifdef _KERNEL
-       /*
-        * If zio data pages are being allocated out of a separate heap segment,
-        * then enforce that the size of available vmem for this area remains
-        * above about 1/32nd free.
-        */
-       if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-           vmem_size(zio_arena, VMEM_FREE) <
-           (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-               return (1);
-#endif
-#endif /* sun */
-
        if (arc_reclaim_needed())
                return (1);
 
@@ -3885,20 +3968,16 @@ static int
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-       uint64_t available_memory =
-           ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+       uint64_t available_memory = ptob(freemem);
        static uint64_t page_load = 0;
        static uint64_t last_txg = 0;
 
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
        available_memory =
-           MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+           MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
 #endif
-#endif /* sun */
 
-       if (cnt.v_free_count + cnt.v_cache_count >
-           (uint64_t)physmem * arc_lotsfree_percent / 100)
+       if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
                return (0);
 
        if (txg > last_txg) {
@@ -3911,7 +3990,7 @@ arc_memory_throttle(uint64_t reserve, ui
         * continue to let page writes occur as quickly as possible.
         */
        if (curproc == pageproc) {
-               if (page_load > available_memory / 4)
+               if (page_load > MAX(ptob(minfree), available_memory) / 4)
                        return (SET_ERROR(ERESTART));
                /* Note: reserve is inflated, so we deflate */
                page_load += reserve / 8;
@@ -3939,8 +4018,10 @@ arc_tempreserve_space(uint64_t reserve, 
        int error;
        uint64_t anon_size;
 
-       if (reserve > arc_c/4 && !arc_no_grow)
+       if (reserve > arc_c/4 && !arc_no_grow) {
                arc_c = MIN(arc_c_max, reserve * 4);
+               DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+       }
        if (reserve > arc_c)
                return (SET_ERROR(ENOMEM));
 
@@ -3994,6 +4075,7 @@ arc_lowmem(void *arg __unused, int howto
        mutex_enter(&arc_lowmem_lock);
        mutex_enter(&arc_reclaim_thr_lock);
        needfree = 1;
+       DTRACE_PROBE(arc__needfree);
        cv_signal(&arc_reclaim_thr_cv);
 
        /*

Modified: stable/10/sys/vm/vm_pageout.c
==============================================================================
--- stable/10/sys/vm/vm_pageout.c       Thu Oct  9 23:52:33 2014        (r272874)
+++ stable/10/sys/vm/vm_pageout.c       Fri Oct 10 00:12:16 2014        (r272875)
@@ -76,6 +76,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
+#include "opt_kdtrace.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -89,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/vnode.h>
@@ -115,10 +117,14 @@ __FBSDID("$FreeBSD$");
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
+static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
 
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+    NULL);
+
 struct proc *pageproc;
 
 static struct kproc_desc page_kp = {
@@ -126,9 +132,13 @@ static struct kproc_desc page_kp = {
        vm_pageout,
        &pageproc
 };
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
+SDT_PROVIDER_DEFINE(vm);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
+
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
 static void vm_daemon(void);
@@ -663,6 +673,7 @@ vm_pageout_grow_cache(int tries, vm_padd
                 * may acquire locks and/or sleep, so they can only be invoked
                 * when "tries" is greater than zero.
                 */
+               SDT_PROBE0(vm, , , vm__lowmem_cache);
                EVENTHANDLER_INVOKE(vm_lowmem, 0);
 
                /*
@@ -925,6 +936,7 @@ vm_pageout_scan(struct vm_domain *vmd, i
                /*
                 * Decrease registered cache sizes.
                 */
+               SDT_PROBE0(vm, , , vm__lowmem_scan);
                EVENTHANDLER_INVOKE(vm_lowmem, 0);
                /*
                 * We do this explicitly after the caches have been
@@ -1650,15 +1662,11 @@ vm_pageout_worker(void *arg)
 }
 
 /*
- *     vm_pageout is the high level pageout daemon.
+ *     vm_pageout_init initialises basic pageout daemon settings.
  */
 static void
-vm_pageout(void)
+vm_pageout_init(void)
 {
-#if MAXMEMDOM > 1
-       int error, i;
-#endif
-
        /*
         * Initialize some paging parameters.
         */
@@ -1704,6 +1712,17 @@ vm_pageout(void)
        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = cnt.v_free_count / 3;
+}
+
+/*
+ *     vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout(void)
+{
+#if MAXMEMDOM > 1
+       int error, i;
+#endif
 
        swap_pager_swap_init();
 #if MAXMEMDOM > 1
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to