Author: jeff
Date: Fri Jan 12 22:48:23 2018
New Revision: 327895
URL: https://svnweb.freebsd.org/changeset/base/327895

Log:
  Implement 'domainset', a cpuset based NUMA policy mechanism.  This allows
  userspace to control NUMA policy administratively and programmatically.
  
  Implement domainset based iterators in the page layer.
  
  Remove the now legacy numa_* syscalls.
  
  Clean up some header pollution created by having seq.h in proc.h.
  
  Reviewed by:  markj, kib
  Discussed with:       alc
  Tested by:    pho
  Sponsored by: Netflix, Dell/EMC Isilon
  Differential Revision:        https://reviews.freebsd.org/D13403

Deleted:
  head/sys/kern/kern_numa.c
  head/sys/sys/_vm_domain.h
  head/sys/vm/vm_domain.c
  head/sys/vm/vm_domain.h
Modified:
  head/lib/libc/sys/Symbol.map
  head/sys/arm/arm/machdep_ptrace.c
  head/sys/compat/freebsd32/freebsd32_misc.c
  head/sys/compat/freebsd32/syscalls.master
  head/sys/conf/files
  head/sys/ddb/db_run.c
  head/sys/kern/init_main.c
  head/sys/kern/init_sysent.c
  head/sys/kern/kern_cpuset.c
  head/sys/kern/kern_exit.c
  head/sys/kern/kern_fork.c
  head/sys/kern/kern_thr.c
  head/sys/kern/kern_thread.c
  head/sys/kern/makesyscalls.sh
  head/sys/kern/sched_4bsd.c
  head/sys/kern/sched_ule.c
  head/sys/kern/subr_kdb.c
  head/sys/kern/syscalls.master
  head/sys/netpfil/ipfw/dn_sched_fq_codel.c
  head/sys/sys/cpuset.h
  head/sys/sys/proc.h
  head/sys/sys/syscallsubr.h
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_object.c
  head/sys/vm/vm_object.h
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_phys.c
  head/sys/vm/vm_phys.h
  head/sys/x86/acpica/srat.c
  head/usr.bin/cpuset/cpuset.c
  head/usr.bin/numactl/numactl.c

Modified: head/lib/libc/sys/Symbol.map
==============================================================================
--- head/lib/libc/sys/Symbol.map        Fri Jan 12 21:50:18 2018        
(r327894)
+++ head/lib/libc/sys/Symbol.map        Fri Jan 12 22:48:23 2018        
(r327895)
@@ -398,6 +398,8 @@ FBSD_1.5 {
        mknodat;
        stat;
        statfs;
+       cpuset_getdomain;
+       cpuset_setdomain;
 };
 
 FBSDprivate_1.0 {
@@ -1022,4 +1024,8 @@ FBSDprivate_1.0 {
        gssd_syscall;
        __libc_interposing_slot;
        __libc_sigwait;
+       _cpuset_getdomain;
+       __sys_cpuset_getdomain;
+       _cpuset_setdomain;
+       __sys_cpuset_setdomain;
 };

Modified: head/sys/arm/arm/machdep_ptrace.c
==============================================================================
--- head/sys/arm/arm/machdep_ptrace.c   Fri Jan 12 21:50:18 2018        
(r327894)
+++ head/sys/arm/arm/machdep_ptrace.c   Fri Jan 12 22:48:23 2018        
(r327895)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
+#include <sys/lock.h>
 #include <sys/mutex.h>
 
 #include <machine/machdep.h>

Modified: head/sys/compat/freebsd32/freebsd32_misc.c
==============================================================================
--- head/sys/compat/freebsd32/freebsd32_misc.c  Fri Jan 12 21:50:18 2018        
(r327894)
+++ head/sys/compat/freebsd32/freebsd32_misc.c  Fri Jan 12 22:48:23 2018        
(r327895)
@@ -3017,6 +3017,24 @@ freebsd32_cpuset_setaffinity(struct thread *td,
 }
 
 int
+freebsd32_cpuset_getdomain(struct thread *td,
+    struct freebsd32_cpuset_getdomain_args *uap)
+{
+
+       return (kern_cpuset_getdomain(td, uap->level, uap->which,
+           PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, 
uap->policy));
+}
+
+int
+freebsd32_cpuset_setdomain(struct thread *td,
+    struct freebsd32_cpuset_setdomain_args *uap)
+{
+
+       return (kern_cpuset_setdomain(td, uap->level, uap->which,
+           PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, 
uap->policy));
+}
+
+int
 freebsd32_nmount(struct thread *td,
     struct freebsd32_nmount_args /* {
        struct iovec *iovp;

Modified: head/sys/compat/freebsd32/syscalls.master
==============================================================================
--- head/sys/compat/freebsd32/syscalls.master   Fri Jan 12 21:50:18 2018        
(r327894)
+++ head/sys/compat/freebsd32/syscalls.master   Fri Jan 12 22:48:23 2018        
(r327895)
@@ -1086,12 +1086,8 @@
 547    AUE_FUTIMESAT   STD     { int freebsd32_utimensat(int fd, \
                                    char *path, \
                                    struct timespec *times, int flag); }
-548    AUE_NULL        NOPROTO { int numa_getaffinity(cpuwhich_t which, \
-                                   id_t id, \
-                                   struct vm_domain_policy *policy); }
-549    AUE_NULL        NOPROTO { int numa_setaffinity(cpuwhich_t which, \
-                                   id_t id, \
-                                   const struct vm_domain_policy *policy); }
+548    AUE_NULL        UNIMPL  numa_getaffinity
+549    AUE_NULL        UNIMPL  numa_setaffinity
 550    AUE_FSYNC       NOPROTO { int fdatasync(int fd); }
 551    AUE_FSTAT       STD     { int freebsd32_fstat(int fd, \
                                    struct stat32 *ub); }
@@ -1119,4 +1115,13 @@
                                    struct kevent32 *eventlist, \
                                    int nevents, \
                                    const struct timespec32 *timeout); }
+561    AUE_NULL        STD     { int freebsd32_cpuset_getdomain(cpulevel_t 
level, \
+                                   cpuwhich_t which, uint32_t id1, uint32_t 
id2, \
+                                   size_t domainsetsize, domainset_t *mask, \
+                                   int *policy); }
+562    AUE_NULL        STD     { int freebsd32_cpuset_setdomain(cpulevel_t 
level, \
+                                   cpuwhich_t which, uint32_t id1, uint32_t 
id2, \
+                                   size_t domainsetsize, domainset_t *mask, \
+                                   int policy); }
+
 ; vim: syntax=off

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Fri Jan 12 21:50:18 2018        (r327894)
+++ head/sys/conf/files Fri Jan 12 22:48:23 2018        (r327895)
@@ -3787,7 +3787,6 @@ kern/kern_module.c                standard
 kern/kern_mtxpool.c            standard
 kern/kern_mutex.c              standard
 kern/kern_ntptime.c            standard
-kern/kern_numa.c               standard
 kern/kern_osd.c                        standard
 kern/kern_physio.c             standard
 kern/kern_pmc.c                        standard
@@ -4837,7 +4836,7 @@ vm/swap_pager.c                   standard
 vm/uma_core.c                  standard
 vm/uma_dbg.c                   standard
 vm/memguard.c                  optional DEBUG_MEMGUARD
-vm/vm_domain.c                 standard
+vm/vm_domainset.c              standard
 vm/vm_fault.c                  standard
 vm/vm_glue.c                   standard
 vm/vm_init.c                   standard

Modified: head/sys/ddb/db_run.c
==============================================================================
--- head/sys/ddb/db_run.c       Fri Jan 12 21:50:18 2018        (r327894)
+++ head/sys/ddb/db_run.c       Fri Jan 12 22:48:23 2018        (r327895)
@@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/proc.h>
+#include <sys/systm.h>
 
 #include <machine/kdb.h>
 #include <machine/pcb.h>

Modified: head/sys/kern/init_main.c
==============================================================================
--- head/sys/kern/init_main.c   Fri Jan 12 21:50:18 2018        (r327894)
+++ head/sys/kern/init_main.c   Fri Jan 12 22:48:23 2018        (r327895)
@@ -89,7 +89,6 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
-#include <vm/vm_domain.h>
 #include <sys/copyright.h>
 
 #include <ddb/ddb.h>
@@ -497,10 +496,7 @@ proc0_init(void *dummy __unused)
        td->td_flags = TDF_INMEM;
        td->td_pflags = TDP_KTHREAD;
        td->td_cpuset = cpuset_thread0();
-       vm_domain_policy_init(&td->td_vm_dom_policy);
-       vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1);
-       vm_domain_policy_init(&p->p_vm_dom_policy);
-       vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1);
+       td->td_domain.dr_policy = td->td_cpuset->cs_domain;
        prison0_init();
        p->p_peers = 0;
        p->p_leader = p;

Modified: head/sys/kern/init_sysent.c
==============================================================================
--- head/sys/kern/init_sysent.c Fri Jan 12 21:50:18 2018        (r327894)
+++ head/sys/kern/init_sysent.c Fri Jan 12 22:48:23 2018        (r327895)
@@ -599,8 +599,8 @@ struct sysent sysent[] = {
        { AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, 
SY_THR_STATIC },     /* 545 = ppoll */
        { AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 
0, SYF_CAPENABLED, SY_THR_STATIC },       /* 546 = futimens */
        { AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 
0, 0, SYF_CAPENABLED, SY_THR_STATIC },   /* 547 = utimensat */
-       { AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, 
AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },       /* 548 = numa_getaffinity */
-       { AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, 
AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },       /* 549 = numa_setaffinity */
+       { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },      
                /* 548 = numa_getaffinity */
+       { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },      
                /* 549 = numa_setaffinity */
        { AS(fdatasync_args), (sy_call_t *)sys_fdatasync, AUE_FSYNC, NULL, 0, 
0, 0, SY_THR_STATIC },    /* 550 = fdatasync */
        { AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, 
SYF_CAPENABLED, SY_THR_STATIC },       /* 551 = fstat */
        { AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, 
SYF_CAPENABLED, SY_THR_STATIC }, /* 552 = fstatat */
@@ -612,4 +612,6 @@ struct sysent sysent[] = {
        { AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 
0, 0, SY_THR_STATIC },   /* 558 = fhstatfs */
        { AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, 
SYF_CAPENABLED, SY_THR_STATIC }, /* 559 = mknodat */
        { AS(kevent_args), (sy_call_t *)sys_kevent, AUE_KEVENT, NULL, 0, 0, 
SYF_CAPENABLED, SY_THR_STATIC },    /* 560 = kevent */
+       { AS(cpuset_getdomain_args), (sy_call_t *)sys_cpuset_getdomain, 
AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },       /* 561 = cpuset_getdomain */
+       { AS(cpuset_setdomain_args), (sy_call_t *)sys_cpuset_setdomain, 
AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },       /* 562 = cpuset_setdomain */
 };

Modified: head/sys/kern/kern_cpuset.c
==============================================================================
--- head/sys/kern/kern_cpuset.c Fri Jan 12 21:50:18 2018        (r327894)
+++ head/sys/kern/kern_cpuset.c Fri Jan 12 22:48:23 2018        (r327895)
@@ -51,17 +51,21 @@ __FBSDID("$FreeBSD$");
 #include <sys/syscallsubr.h>
 #include <sys/capsicum.h>
 #include <sys/cpuset.h>
+#include <sys/domainset.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
+#include <sys/vmmeter.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
+#include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
+#include <vm/vm_phys.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -109,8 +113,10 @@ __FBSDID("$FreeBSD$");
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
 static uma_zone_t cpuset_zone;
+static uma_zone_t domainset_zone;
 static struct mtx cpuset_lock;
 static struct setlist cpuset_ids;
+static struct domainlist cpuset_domains;
 static struct unrhdr *cpuset_unr;
 static struct cpuset *cpuset_zero, *cpuset_default;
 
@@ -121,7 +127,33 @@ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_
 cpuset_t *cpuset_root;
 cpuset_t cpuset_domain[MAXMEMDOM];
 
+static int domainset_valid(const struct domainset *, const struct domainset *);
+
 /*
+ * Find the first non-anonymous set starting from 'set'.
+ */
+static struct cpuset *
+cpuset_getbase(struct cpuset *set)
+{
+
+       if (set->cs_id == CPUSET_INVALID)
+               set = set->cs_parent;
+       return (set);
+}
+
+/*
+ * Walks up the tree from 'set' to find the root.
+ */
+static struct cpuset *
+cpuset_getroot(struct cpuset *set)
+{
+
+       while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
+               set = set->cs_parent;
+       return (set);
+}
+
+/*
  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  */
 struct cpuset *
@@ -140,12 +172,7 @@ static struct cpuset *
 cpuset_refroot(struct cpuset *set)
 {
 
-       for (; set->cs_parent != NULL; set = set->cs_parent)
-               if (set->cs_flags & CPU_SET_ROOT)
-                       break;
-       cpuset_ref(set);
-
-       return (set);
+       return (cpuset_ref(cpuset_getroot(set)));
 }
 
 /*
@@ -157,11 +184,7 @@ static struct cpuset *
 cpuset_refbase(struct cpuset *set)
 {
 
-       if (set->cs_id == CPUSET_INVALID)
-               set = set->cs_parent;
-       cpuset_ref(set);
-
-       return (set);
+       return (cpuset_ref(cpuset_getbase(set)));
 }
 
 /*
@@ -257,17 +280,25 @@ cpuset_lookup(cpusetid_t setid, struct thread *td)
  * will have no valid cpu based on restrictions from the parent.
  */
 static int
-_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
-    cpusetid_t id)
+_cpuset_create(struct cpuset *set, struct cpuset *parent,
+    const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
 {
 
+       if (domain == NULL)
+               domain = parent->cs_domain;
+       if (mask == NULL)
+               mask = &parent->cs_mask;
        if (!CPU_OVERLAP(&parent->cs_mask, mask))
                return (EDEADLK);
+       /* The domain must be prepared ahead of time. */
+       if (!domainset_valid(parent->cs_domain, domain))
+               return (EDEADLK);
        CPU_COPY(mask, &set->cs_mask);
        LIST_INIT(&set->cs_children);
        refcount_init(&set->cs_ref, 1);
        set->cs_flags = 0;
        mtx_lock_spin(&cpuset_lock);
+       set->cs_domain = domain;
        CPU_AND(&set->cs_mask, &parent->cs_mask);
        set->cs_id = id;
        set->cs_parent = cpuset_ref(parent);
@@ -294,8 +325,8 @@ cpuset_create(struct cpuset **setp, struct cpuset *par
        id = alloc_unr(cpuset_unr);
        if (id == -1)
                return (ENFILE);
-       *setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
-       error = _cpuset_create(set, parent, mask, id);
+       *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+       error = _cpuset_create(set, parent, mask, NULL, id);
        if (error == 0)
                return (0);
        free_unr(cpuset_unr, id);
@@ -304,7 +335,207 @@ cpuset_create(struct cpuset **setp, struct cpuset *par
        return (error);
 }
 
+static void
+cpuset_freelist_add(struct setlist *list, int count)
+{
+       struct cpuset *set;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
+               LIST_INSERT_HEAD(list, set, cs_link);
+       }
+}
+
+static void
+cpuset_freelist_init(struct setlist *list, int count)
+{
+
+       LIST_INIT(list);
+       cpuset_freelist_add(list, count);
+}
+
+static void
+cpuset_freelist_free(struct setlist *list)
+{
+       struct cpuset *set;
+
+       while ((set = LIST_FIRST(list)) != NULL) {
+               LIST_REMOVE(set, cs_link);
+               uma_zfree(cpuset_zone, set);
+       }
+}
+
+static void
+domainset_freelist_add(struct domainlist *list, int count)
+{
+       struct domainset *set;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
+               LIST_INSERT_HEAD(list, set, ds_link);
+       }
+}
+
+static void
+domainset_freelist_init(struct domainlist *list, int count)
+{
+
+       LIST_INIT(list);
+       domainset_freelist_add(list, count);
+}
+
+static void
+domainset_freelist_free(struct domainlist *list)
+{
+       struct domainset *set;
+
+       while ((set = LIST_FIRST(list)) != NULL) {
+               LIST_REMOVE(set, ds_link);
+               uma_zfree(domainset_zone, set);
+       }
+}
+
+/* Copy a domainset preserving mask and policy. */
+static void
+domainset_copy(const struct domainset *from, struct domainset *to)
+{
+
+       DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
+       to->ds_policy = from->ds_policy;
+       to->ds_prefer = from->ds_prefer;
+}
+
+/* Return 1 if mask and policy are equal, otherwise 0. */
+static int
+domainset_equal(const struct domainset *one, const struct domainset *two)
+{
+
+       return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
+           one->ds_policy == two->ds_policy &&
+           one->ds_prefer == two->ds_prefer);
+}
+
+/* Return 1 if child is a valid subset of parent. */
+static int
+domainset_valid(const struct domainset *parent, const struct domainset *child)
+{
+       if (child->ds_policy != DOMAINSET_POLICY_PREFER)
+               return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
+       return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
+}
+
+static int
+domainset_restrict(const struct domainset *parent,
+    const struct domainset *child)
+{
+       if (child->ds_policy != DOMAINSET_POLICY_PREFER)
+               return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
+       return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
+}
+
 /*
+ * Lookup or create a domainset.  The key is provided in ds_mask, ds_policy
+ * and ds_prefer.  If the domainset does not yet exist the storage in
+ * 'domain' is used to insert.  Otherwise this storage is returned to
+ * 'freelist' (or the domainset_zone if NULL) and the existing one is returned.
+ */
+static struct domainset *
+_domainset_create(struct domainset *domain, struct domainlist *freelist)
+{
+       struct domainset *ndomain;
+
+       mtx_lock_spin(&cpuset_lock);
+       LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
+               if (domainset_equal(ndomain, domain))
+                       break;
+       /*
+        * If the domain does not yet exist we insert it and initialize
+        * various iteration helpers which are not part of the key.
+        */
+       if (ndomain == NULL) {
+               LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
+               domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
+               domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+       }
+       mtx_unlock_spin(&cpuset_lock);
+       if (ndomain == NULL)
+               return (domain);
+       if (freelist != NULL)
+               LIST_INSERT_HEAD(freelist, domain, ds_link);
+       else
+               uma_zfree(domainset_zone, domain);
+       return (ndomain);
+       
+}
+
+/*
+ * Create or lookup a domainset based on the key held in 'domain'.
+ */
+static struct domainset *
+domainset_create(const struct domainset *domain)
+{
+       struct domainset *ndomain;
+
+       ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
+       domainset_copy(domain, ndomain);
+       return _domainset_create(ndomain, NULL);
+}
+
+/*
+ * Update thread domainset pointers.
+ */
+static void
+domainset_notify(void)
+{
+       struct thread *td;
+       struct proc *p;
+
+       sx_slock(&allproc_lock);
+       FOREACH_PROC_IN_SYSTEM(p) {
+               PROC_LOCK(p);
+               if (p->p_state == PRS_NEW) {
+                       PROC_UNLOCK(p);
+                       continue;
+               }
+               FOREACH_THREAD_IN_PROC(p, td) {
+                       thread_lock(td);
+                       td->td_domain.dr_policy = td->td_cpuset->cs_domain;
+                       thread_unlock(td);
+               }
+               PROC_UNLOCK(p);
+       }
+       sx_sunlock(&allproc_lock);
+       kernel_object->domain.dr_policy = cpuset_default->cs_domain;
+}
+
+/*
+ * Create a new set that is a subset of a parent.
+ */
+static struct domainset *
+domainset_shadow(const struct domainset *pdomain,
+    const struct domainset *domain, struct domainlist *freelist)
+{
+       struct domainset *ndomain;
+
+       ndomain = LIST_FIRST(freelist);
+       LIST_REMOVE(ndomain, ds_link);
+
+       /*
+        * Initialize the key from the request.
+        */
+       domainset_copy(domain, ndomain);
+
+       /*
+        * Restrict the key by the parent.
+        */
+       DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
+
+       return _domainset_create(ndomain, freelist);
+}
+
+/*
  * Recursively check for errors that would occur from applying mask to
  * the tree of sets starting at 'set'.  Checks for sets that would become
  * empty as well as RDONLY flags.
@@ -376,10 +607,12 @@ cpuset_modify(struct cpuset *set, cpuset_t *mask)
         * Verify that we have access to this set of
         * cpus.
         */
-       root = set->cs_parent;
-       if (root && !CPU_SUBSET(&root->cs_mask, mask))
-               return (EINVAL);
+       root = cpuset_getroot(set);
        mtx_lock_spin(&cpuset_lock);
+       if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
+               error = EINVAL;
+               goto out;
+       }
        error = cpuset_testupdate(set, mask, 0);
        if (error)
                goto out;
@@ -392,6 +625,141 @@ out:
 }
 
 /*
+ * Recursively check for errors that would occur from applying the domainset
+ * 'dset' to the tree of sets starting at 'set'.  Checks for sets that would
+ * become empty as well as RDONLY flags, and counts the sets that change.
+ */
+static int
+cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
+    struct domainset *orig, int *count, int check_mask)
+{
+       struct cpuset *nset;
+       struct domainset *domain;
+       struct domainset newset;
+       int error;
+
+       mtx_assert(&cpuset_lock, MA_OWNED);
+       if (set->cs_flags & CPU_SET_RDONLY)
+               return (EPERM);
+       domain = set->cs_domain;
+       domainset_copy(domain, &newset);
+       if (!domainset_equal(domain, orig)) {
+               if (!domainset_restrict(domain, dset))
+                       return (EDEADLK);
+               DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
+               /* Count the number of domains that are changing. */
+               (*count)++;
+       }
+       error = 0;
+       LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
+               if ((error = cpuset_testupdate_domain(nset, &newset, domain,
+                   count, 1)) != 0)
+                       break;
+       return (error);
+}
+
+/*
+ * Applies 'domain' to the tree without checking for empty sets or permissions.
+ */
+static void
+cpuset_update_domain(struct cpuset *set, struct domainset *domain,
+    struct domainset *orig, struct domainlist *domains)
+{
+       struct cpuset *nset;
+
+       mtx_assert(&cpuset_lock, MA_OWNED);
+       /*
+        * If this domainset has changed from the parent we must calculate
+        * a new set.  Otherwise it simply inherits from the parent.  When
+        * we inherit from the parent we get a new mask and policy.  If the
+        * set is modified from the parent we keep the policy and only
+        * update the mask.
+        */
+       if (set->cs_domain != orig) {
+               orig = set->cs_domain;
+               set->cs_domain = domainset_shadow(domain, orig, domains);
+       } else
+               set->cs_domain = domain;
+       LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
+               cpuset_update_domain(nset, set->cs_domain, orig, domains);
+
+       return;
+}
+
+/*
+ * Modify the set 'set' to use a copy of the domainset provided.  Apply this
+ * new mask to restrict all children in the tree.  Checks for validity before
+ * applying the changes.
+ */
+static int
+cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
+{
+       struct domainlist domains;
+       struct domainset temp;
+       struct domainset *dset;
+       struct cpuset *root;
+       int ndomains, needed;
+       int error;
+
+       error = priv_check(curthread, PRIV_SCHED_CPUSET);
+       if (error)
+               return (error);
+       /*
+        * In case we are called from within the jail
+        * we do not allow modifying the dedicated root
+        * cpuset of the jail but may still allow to
+        * change child sets.
+        */
+       if (jailed(curthread->td_ucred) &&
+           set->cs_flags & CPU_SET_ROOT)
+               return (EPERM);
+       domainset_freelist_init(&domains, 0);
+       domain = domainset_create(domain);
+       ndomains = needed = 0;
+       do {
+               if (ndomains < needed) {
+                       domainset_freelist_add(&domains, needed - ndomains);
+                       ndomains = needed;
+               }
+               root = cpuset_getroot(set);
+               mtx_lock_spin(&cpuset_lock);
+               dset = root->cs_domain;
+               /*
+                * Verify that we have access to this set of domains.
+                */
+               if (root && !domainset_valid(dset, domain)) {
+                       error = EINVAL;
+                       goto out;
+               }
+               /*
+                * If applying prefer we keep the current set as the fallback.
+                */
+               if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
+                       DOMAINSET_COPY(&set->cs_domain->ds_mask,
+                           &domain->ds_mask);
+               /*
+                * Determine whether we can apply this set of domains and
+                * how many new domain structures it will require.
+                */
+               domainset_copy(domain, &temp);
+               needed = 0;
+               error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
+                   &needed, 0);
+               if (error)
+                       goto out;
+       } while (ndomains < needed);
+       dset = set->cs_domain;
+       cpuset_update_domain(set, domain, dset, &domains);
+out:
+       mtx_unlock_spin(&cpuset_lock);
+       domainset_freelist_free(&domains);
+       if (error == 0)
+               domainset_notify();
+
+       return (error);
+}
+
+/*
  * Resolve the 'which' parameter of several cpuset apis.
  *
  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
@@ -481,44 +849,203 @@ cpuset_which(cpuwhich_t which, id_t id, struct proc **
        return (0);
 }
 
+static int
+cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
+    const struct domainset *domain)
+{
+       struct cpuset *parent;
+       struct domainset *dset;
+
+       parent = cpuset_getbase(set);
+       /*
+        * If we are restricting a cpu mask it must be a subset of the
+        * parent or invalid CPUs have been specified.
+        */
+       if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
+               return (EINVAL);
+
+       /*
+        * If we are restricting a domain mask it must be a subset of the
+        * parent or invalid domains have been specified.
+        */
+       dset = parent->cs_domain;
+       if (domain != NULL && !domainset_valid(dset, domain))
+               return (EINVAL);
+
+       return (0);
+}
+
 /*
  * Create an anonymous set with the provided mask in the space provided by
- * 'fset'.  If the passed in set is anonymous we use its parent otherwise
+ * 'nset'.  If the passed in set is anonymous we use its parent otherwise
  * the new set is a child of 'set'.
  */
 static int
-cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
+cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
+   const cpuset_t *mask, const struct domainset *domain,
+   struct setlist *cpusets, struct domainlist *domains)
 {
        struct cpuset *parent;
+       struct cpuset *nset;
+       struct domainset *dset;
+       struct domainset *d;
+       int error;
 
-       if (set->cs_id == CPUSET_INVALID)
-               parent = set->cs_parent;
+       error = cpuset_testshadow(set, mask, domain);
+       if (error)
+               return (error);
+
+       parent = cpuset_getbase(set);
+       dset = parent->cs_domain;
+       if (mask == NULL)
+               mask = &set->cs_mask;
+       if (domain != NULL)
+               d = domainset_shadow(dset, domain, domains);
        else
-               parent = set;
-       if (!CPU_SUBSET(&parent->cs_mask, mask))
+               d = set->cs_domain;
+       nset = LIST_FIRST(cpusets);
+       error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
+       if (error == 0) {
+               LIST_REMOVE(nset, cs_link);
+               *nsetp = nset;
+       }
+       return (error);
+}
+
+static struct cpuset *
+cpuset_update_thread(struct thread *td, struct cpuset *nset)
+{
+       struct cpuset *tdset;
+
+       tdset = td->td_cpuset;
+       td->td_cpuset = nset;
+       td->td_domain.dr_policy = nset->cs_domain;
+       sched_affinity(td);
+
+       return (tdset);
+}
+
+static int
+cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
+    struct domainset *domain)
+{
+       struct cpuset *parent;
+
+       parent = cpuset_getbase(tdset);
+       if (mask == NULL)
+               mask = &tdset->cs_mask;
+       if (domain == NULL)
+               domain = tdset->cs_domain;
+       return cpuset_testshadow(parent, mask, domain);
+}
+
+static int
+cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
+    struct domainset *domain, struct cpuset **nsetp,
+    struct setlist *freelist, struct domainlist *domainlist)
+{
+       struct cpuset *parent;
+
+       parent = cpuset_getbase(tdset);
+       if (mask == NULL)
+               mask = &tdset->cs_mask;
+       if (domain == NULL)
+               domain = tdset->cs_domain;
+       return cpuset_shadow(parent, nsetp, mask, domain, freelist,
+           domainlist);
+}
+
+static int
+cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
+    cpuset_t *mask, struct domainset *domain)
+{
+       struct cpuset *parent;
+
+       parent = cpuset_getbase(tdset);
+
+       /*
+        * If the thread restricted its mask then apply that same
+        * restriction to the new set, otherwise take it wholesale.
+        */
+       if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
+               CPU_COPY(&tdset->cs_mask, mask);
+               CPU_AND(mask, &set->cs_mask);
+       } else
+               CPU_COPY(&set->cs_mask, mask);
+
+       /*
+        * If the thread restricted the domain then we apply the
+        * restriction to the new set but retain the policy.
+        */
+       if (tdset->cs_domain != parent->cs_domain) {
+               domainset_copy(tdset->cs_domain, domain);
+               DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
+       } else
+               domainset_copy(set->cs_domain, domain);
+
+       if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
                return (EDEADLK);
-       return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
+
+       return (0);
 }
 
+static int
+cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
+{
+       struct domainset domain;
+       cpuset_t mask;
+
+       if (tdset->cs_id != CPUSET_INVALID)
+               return (0);
+       return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+}
+
+static int
+cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
+    struct cpuset **nsetp, struct setlist *freelist,
+    struct domainlist *domainlist)
+{
+       struct domainset domain;
+       cpuset_t mask;
+       int error;
+
+       /*
+        * If we're replacing on a thread that has not constrained the
+        * original set we can simply accept the new set.
+        */
+       if (tdset->cs_id != CPUSET_INVALID) {
+               *nsetp = cpuset_ref(set);
+               return (0);
+       }
+       error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+       if (error)
+               return (error);
+
+       return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
+           domainlist);
+}
+
 /*
- * Handle two cases for replacing the base set or mask of an entire process.
+ * Handle three cases for updating an entire process.
  *
- * 1) Set is non-null and mask is null.  This reparents all anonymous sets
- *    to the provided set and replaces all non-anonymous td_cpusets with the
- *    provided set.
- * 2) Mask is non-null and set is null.  This replaces or creates anonymous
- *    sets for every thread with the existing base as a parent.
+ * 1) Set is non-null.  This reparents all anonymous sets to the provided
+ *    set and replaces all non-anonymous td_cpusets with the provided set.
+ * 2) Mask is non-null.  This replaces or creates anonymous sets for every
+ *    thread with the existing base as a parent.
+ * 3) domain is non-null.  This creates anonymous sets for every thread
+ *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a 
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  */
 static int
-cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
+cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
+    struct domainset *domain)
 {
        struct setlist freelist;
        struct setlist droplist;
-       struct cpuset *tdset;
+       struct domainlist domainlist;
        struct cpuset *nset;
        struct thread *td;
        struct proc *p;
@@ -533,7 +1060,9 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
         * 2) If enough cpusets have not been allocated release the locks and
         *    allocate them.  Loop.
         */
-       LIST_INIT(&freelist);
+       cpuset_freelist_init(&freelist, 1);
+       domainset_freelist_init(&domainlist, 1);
+       nfree = 1;
        LIST_INIT(&droplist);
        nfree = 0;
        for (;;) {
@@ -544,39 +1073,27 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
                        break;
                threads = p->p_numthreads;
                PROC_UNLOCK(p);
-               for (; nfree < threads; nfree++) {
-                       nset = uma_zalloc(cpuset_zone, M_WAITOK);
-                       LIST_INSERT_HEAD(&freelist, nset, cs_link);
+               if (nfree < threads) {
+                       cpuset_freelist_add(&freelist, threads - nfree);
+                       domainset_freelist_add(&domainlist, threads - nfree);
+                       nfree = threads;
                }
        }
        PROC_LOCK_ASSERT(p, MA_OWNED);
        /*
         * Now that the appropriate locks are held and we have enough cpusets,
-        * make sure the operation will succeed before applying changes.  The
+        * make sure the operation will succeed before applying changes. The
         * proc lock prevents td_cpuset from changing between calls.
         */
        error = 0;
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
-               tdset = td->td_cpuset;
-               /*
-                * Verify that a new mask doesn't specify cpus outside of
-                * the set the thread is a member of.
-                */
-               if (mask) {
-                       if (tdset->cs_id == CPUSET_INVALID)
-                               tdset = tdset->cs_parent;
-                       if (!CPU_SUBSET(&tdset->cs_mask, mask))
-                               error = EDEADLK;
-               /*
-                * Verify that a new set won't leave an existing thread
-                * mask without a cpu to run on.  It can, however, restrict
-                * the set.
-                */
-               } else if (tdset->cs_id == CPUSET_INVALID) {
-                       if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
-                               error = EDEADLK;
-               }
+               if (set != NULL)
+                       error = cpuset_setproc_test_setthread(td->td_cpuset,
+                           set);
+               else
+                       error = cpuset_setproc_test_maskthread(td->td_cpuset,
+                           mask, domain);
                thread_unlock(td);
                if (error)
                        goto unlock_out;
@@ -588,33 +1105,17 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
         */
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
-               /*
-                * If we presently have an anonymous set or are applying a
-                * mask we must create an anonymous shadow set.  That is
-                * either parented to our existing base or the supplied set.
-                *
-                * If we have a base set with no anonymous shadow we simply
-                * replace it outright.
-                */
-               tdset = td->td_cpuset;
-               if (tdset->cs_id == CPUSET_INVALID || mask) {
-                       nset = LIST_FIRST(&freelist);
-                       LIST_REMOVE(nset, cs_link);
-                       if (mask)
-                               error = cpuset_shadow(tdset, nset, mask);
-                       else
-                               error = _cpuset_create(nset, set,
-                                   &tdset->cs_mask, CPUSET_INVALID);
-                       if (error) {
-                               LIST_INSERT_HEAD(&freelist, nset, cs_link);
-                               thread_unlock(td);
-                               break;
-                       }
-               } else
-                       nset = cpuset_ref(set);
-               cpuset_rel_defer(&droplist, tdset);
-               td->td_cpuset = nset;
-               sched_affinity(td);
+               if (set != NULL)
+                       error = cpuset_setproc_setthread(td->td_cpuset, set,
+                           &nset, &freelist, &domainlist);
+               else
+                       error = cpuset_setproc_maskthread(td->td_cpuset, mask,
+                           domain, &nset, &freelist, &domainlist);
+               if (error) {
+                       thread_unlock(td);
+                       break;
+               }
+               cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
                thread_unlock(td);
        }
 unlock_out:
@@ -622,10 +1123,8 @@ unlock_out:
 out:
        while ((nset = LIST_FIRST(&droplist)) != NULL)
                cpuset_rel_complete(nset);
-       while ((nset = LIST_FIRST(&freelist)) != NULL) {
-               LIST_REMOVE(nset, cs_link);
-               uma_zfree(cpuset_zone, nset);
-       }
+       cpuset_freelist_free(&freelist);
+       domainset_freelist_free(&domainlist);
        return (error);
 }
 
@@ -690,46 +1189,57 @@ cpusetobj_strscan(cpuset_t *set, const char *buf)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to