On Thu, Feb 27 2014, Jens Axboe wrote:
> On 2014-02-26 17:12, Elliott, Robert (Server Storage) wrote:
> >>-----Original Message-----
> >>From: Jens Axboe [mailto:[email protected]]
> >>Sent: Wednesday, 26 February, 2014 6:08 PM
> >>To: Elliott, Robert (Server Storage); [email protected]
> >>Subject: Re: cpus_allowed per thread behavior
> >>
> >>On 2014-02-26 15:54, Elliott, Robert (Server Storage) wrote:
> >>>fio seems to assign the same cpus_allowed/cpumask value to all threads.
> >>>I think this allows the OS to move the threads around those CPUs.
> >>
> >>Correct. As long as the number of cpus in the mask is equal to (or
> >>larger than) the number of jobs within that group, the OS is free to
> >>place them wherever it wants. In practice, unless the CPU scheduling is
> >>horribly broken, they tend to "stick" for most intents and purposes.
> >>
> >>>In comparison, iometer assigns its worker threads to specific CPUs
> >>>within the cpumask in a round-robin manner. Would that be worth adding
> >>>to fio, perhaps with an option like cpus_allowed_policy=roundrobin?
> >>
> >>Sure, we could add that feature. You can get the same setup now, if you
> >>"unroll" the job section, but that might not always be practical. How
> >>about cpus_allowed_policy, with 'shared' being the existing (and
> >>default) behavior and 'split' being each thread grabbing one of the CPUs?
> >
> >Perhaps NUMA and hyperthreading aware allocation policies would
> >also be useful?
> >
> >I don't know how consistent hyperthread CPU numbering is across
> >systems. On some servers I've tried, linux assigns 0-5 to the main
> >cores and 6-11 to the hyperthreaded siblings, while Windows assigns
> >0,2,4,6,8,10 to the main cores and 1,3,5,7,9,11 to their
> >hyperthreaded siblings.
>
> Linux follows the firmware on that, at least as far as I know. I've
> seen machines renumber when getting a new firmware, going from the
> second scheme you list to the first. But for the below, we cannot
> assume any of them; on some machines you also have > 2 threads per
> core. So the topology would have to be queried.
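(For completeness: on Linux the sibling layout can at least be read out of
sysfs, e.g.

  $ cat /sys/devices/system/cpu/cpu0/topology/thread_siblings_list
  0,6

where the output is made up here to match the first numbering scheme above.
That's what a future numa/topology-aware policy would have to parse.)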
Here's a test patch that implements the shared/split policy.
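Expected usage, as a made-up example job file: with the patch applied, the
four clones below each get pinned to one CPU out of the set, instead of all
of them floating across CPUs 0-3:

  [global]
  cpus_allowed=0-3
  cpus_allowed_policy=split

  [pinned]
  rw=randread
  size=128m
  numjobs=4

Leaving cpus_allowed_policy out (or setting it to shared) behaves exactly
like current fio.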
diff --git a/HOWTO b/HOWTO
index 4dacd98965ea..040b8a8949c6 100644
--- a/HOWTO
+++ b/HOWTO
@@ -928,6 +928,17 @@ cpus_allowed=str Controls the same options as cpumask, but it allows a text
 		allows a range of CPUs. Say you wanted a binding to CPUs
 		1, 5, and 8-15, you would set cpus_allowed=1,5,8-15.
 
+cpus_allowed_policy=str Set the policy of how fio distributes the CPUs
+		specified by cpus_allowed or cpumask. Two policies are
+		supported:
+
+		shared	All jobs will share the CPU set specified.
+		split	Each job will get a unique CPU from the CPU set.
+
+		Shared is the default behaviour, if the option isn't
+		specified. If split is specified, fio will error out if
+		there are more jobs defined than CPUs given in the set.
+
 numa_cpu_nodes=str Set this job running on specified NUMA nodes' CPUs. The
 	arguments allow comma delimited list of cpu numbers,
 	A-B ranges, or 'all'. Note, to enable numa options support,
diff --git a/backend.c b/backend.c
index ee395bd0ea57..12c76d8545ef 100644
--- a/backend.c
+++ b/backend.c
@@ -1278,6 +1278,15 @@ static void *thread_main(void *data)
 	 * allocations.
 	 */
 	if (o->cpumask_set) {
+		if (o->cpus_allowed_policy == FIO_CPUS_SPLIT) {
+			ret = fio_cpus_split(&o->cpumask, td->thread_number - 1);
+			if (!ret) {
+				log_err("fio: no CPUs set\n");
+				log_err("fio: Try increasing number of available CPUs\n");
+				td_verror(td, EINVAL, "cpus_split");
+				goto err;
+			}
+		}
 		ret = fio_setaffinity(td->pid, o->cpumask);
 		if (ret == -1) {
 			td_verror(td, errno, "cpu_set_affinity");
diff --git a/cconv.c b/cconv.c
index fd8d0ad85142..357a7845e559 100644
--- a/cconv.c
+++ b/cconv.c
@@ -188,6 +188,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->numjobs = le32_to_cpu(top->numjobs);
 	o->cpumask_set = le32_to_cpu(top->cpumask_set);
 	o->verify_cpumask_set = le32_to_cpu(top->verify_cpumask_set);
+	o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy);
 	o->iolog = le32_to_cpu(top->iolog);
 	o->rwmixcycle = le32_to_cpu(top->rwmixcycle);
 	o->nice = le32_to_cpu(top->nice);
@@ -343,6 +344,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->numjobs = cpu_to_le32(o->numjobs);
 	top->cpumask_set = cpu_to_le32(o->cpumask_set);
 	top->verify_cpumask_set = cpu_to_le32(o->verify_cpumask_set);
+	top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy);
 	top->iolog = cpu_to_le32(o->iolog);
 	top->rwmixcycle = cpu_to_le32(o->rwmixcycle);
 	top->nice = cpu_to_le32(o->nice);
diff --git a/fio.1 b/fio.1
index c530d8440cd0..294e3836b4e5 100644
--- a/fio.1
+++ b/fio.1
@@ -833,6 +833,23 @@ may run on. See \fBsched_setaffinity\fR\|(2).
.BI cpus_allowed \fR=\fPstr
Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers.
.TP
+.BI cpus_allowed_policy \fR=\fPstr
+Set the policy of how fio distributes the CPUs specified by \fBcpus_allowed\fR
+or \fBcpumask\fR. Two policies are supported:
+.RS
+.RS
+.TP
+.B shared
+All jobs will share the CPU set specified.
+.TP
+.B split
+Each job will get a unique CPU from the CPU set.
+.RE
+.P
+\fBshared\fR is the default behaviour, if the option isn't specified. If
+\fBsplit\fR is specified, fio will error out if there are more jobs
+defined than CPUs given in the set.
+.TP
.BI numa_cpu_nodes \fR=\fPstr
Set this job running on specified NUMA nodes' CPUs. The arguments allow
comma delimited list of cpu numbers, A-B ranges, or 'all'.
diff --git a/fio.h b/fio.h
index 9159b0c2de3e..6f5f29fb3a97 100644
--- a/fio.h
+++ b/fio.h
@@ -629,4 +629,9 @@ enum {
 	FIO_RAND_GEN_LFSR,
 };
 
+enum {
+	FIO_CPUS_SHARED	= 0,
+	FIO_CPUS_SPLIT,
+};
+
 #endif
diff --git a/options.c b/options.c
index 6d3956e307bf..c1a8f323e956 100644
--- a/options.c
+++ b/options.c
@@ -394,6 +394,21 @@ static int str_exitall_cb(void)
 }
 
 #ifdef FIO_HAVE_CPU_AFFINITY
+int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu)
+{
+	const long max_cpu = cpus_online();
+	unsigned int i;
+
+	/* keep only the CPU whose index matches this job's index */
+	for (i = 0; i < max_cpu; i++) {
+		if (i != cpu)
+			fio_cpu_clear(mask, i);
+	}
+
+	return fio_cpu_count(mask);
+}
+
 static int str_cpumask_cb(void *data, unsigned long long *val)
 {
 	struct thread_data *td = data;
@@ -2875,6 +2890,27 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
 	},
+	{
+		.name	= "cpus_allowed_policy",
+		.lname	= "CPUs allowed distribution policy",
+		.type	= FIO_OPT_STR,
+		.off1	= td_var_offset(cpus_allowed_policy),
+		.help	= "Distribution policy for cpus_allowed",
+		.parent	= "cpus_allowed",
+		.prio	= 1,
+		.posval	= {
+			  { .ival = "shared",
+			    .oval = FIO_CPUS_SHARED,
+			    .help = "Mask shared between threads",
+			  },
+			  { .ival = "split",
+			    .oval = FIO_CPUS_SPLIT,
+			    .help = "Mask split between threads",
+			  },
+		},
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_CRED,
+	},
#endif
#ifdef CONFIG_LIBNUMA
{
diff --git a/os/os-freebsd.h b/os/os-freebsd.h
index 57ce409c67fd..402792a0f7d7 100644
--- a/os/os-freebsd.h
+++ b/os/os-freebsd.h
@@ -32,6 +32,7 @@ typedef cpuset_t os_cpu_mask_t;
 #define fio_cpu_clear(mask, cpu)	(void) CPU_CLR((cpu), (mask))
 #define fio_cpu_set(mask, cpu)		(void) CPU_SET((cpu), (mask))
+#define fio_cpu_count(mask)		CPU_COUNT((mask))
 
 static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 {
diff --git a/os/os-linux.h b/os/os-linux.h
index 5d1d62db27a0..3ed8c2ef31f2 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -61,6 +61,7 @@ typedef struct drand48_data os_random_state_t;
 #define fio_cpu_clear(mask, cpu)	(void) CPU_CLR((cpu), (mask))
 #define fio_cpu_set(mask, cpu)		(void) CPU_SET((cpu), (mask))
+#define fio_cpu_count(mask)		CPU_COUNT((mask))
 
 static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 {
diff --git a/os/os-solaris.h b/os/os-solaris.h
index e6612118ace4..7a0a3f0bfeca 100644
--- a/os/os-solaris.h
+++ b/os/os-solaris.h
@@ -111,6 +111,16 @@ static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 	return 0;
 }
 
+static inline int fio_cpu_count(os_cpu_mask_t *mask)
+{
+	unsigned int num_cpus;
+
+	if (pset_info(*mask, NULL, &num_cpus, NULL) < 0)
+		return 0;
+
+	return num_cpus;
+}
+
 static inline int fio_cpuset_exit(os_cpu_mask_t *mask)
 {
 	if (pset_destroy(*mask) < 0)
diff --git a/os/os-windows.h b/os/os-windows.h
index de120b64ff7e..7bfe3d2255e4 100644
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -214,6 +214,11 @@ static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu)
 	*mask |= 1 << cpu;
 }
 
+static inline int fio_cpu_count(os_cpu_mask_t *mask)
+{
+	return hweight64(*mask);
+}
+
 static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 {
 	*mask = 0;
diff --git a/os/os.h b/os/os.h
index 03d1e9a14565..a6bc17f09b57 100644
--- a/os/os.h
+++ b/os/os.h
@@ -80,7 +80,10 @@ typedef struct aiocb os_aiocb_t;
 #define fio_getaffinity(pid, mask)	do { } while (0)
 #define fio_cpu_clear(mask, cpu)	do { } while (0)
 #define fio_cpuset_exit(mask)		(-1)
+#define fio_cpus_split(mask, cpu)	(0)
 typedef unsigned long os_cpu_mask_t;
+#else
+extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 #endif
#ifndef FIO_HAVE_IOPRIO
diff --git a/thread_options.h b/thread_options.h
index 14a4e54abcc7..4ea6ebd06a0c 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -155,6 +155,7 @@ struct thread_options {
 	unsigned int cpumask_set;
 	os_cpu_mask_t verify_cpumask;
 	unsigned int verify_cpumask_set;
+	unsigned int cpus_allowed_policy;
 #ifdef CONFIG_LIBNUMA
 	struct bitmask *numa_cpunodesmask;
 	unsigned int numa_cpumask_set;
@@ -378,6 +379,7 @@ struct thread_options_pack {
 	uint32_t cpumask_set;
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
 	uint32_t verify_cpumask_set;
+	uint32_t cpus_allowed_policy;
 	uint32_t iolog;
 	uint32_t rwmixcycle;
 	uint32_t rwmix[DDIR_RWDIR_CNT];
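
For anyone who wants to see what split does to a mask without applying the
patch, here's a small standalone sketch (not part of the patch) of the same
clearing logic against the raw glibc cpu_set_t API that os-linux.h wraps.
Job indexes are 0-based, matching the thread_number - 1 call above, and the
"8 CPUs online" value just stands in for cpus_online():

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  /* same idea as fio_cpus_split(): keep only the CPU whose index
   * matches the job index, return how many CPUs are left */
  static int cpus_split(cpu_set_t *mask, unsigned int idx, unsigned int max_cpu)
  {
  	unsigned int i;

  	for (i = 0; i < max_cpu; i++)
  		if (i != idx)
  			CPU_CLR(i, mask);

  	return CPU_COUNT(mask);
  }

  int main(void)
  {
  	unsigned int job, i;

  	for (job = 0; job < 5; job++) {
  		cpu_set_t mask;

  		CPU_ZERO(&mask);
  		for (i = 0; i < 4; i++)		/* cpus_allowed=0-3 */
  			CPU_SET(i, &mask);

  		/* pretend 8 CPUs are online */
  		printf("job %u -> %d cpu(s)\n", job, cpus_split(&mask, job, 8));
  	}
  	return 0;
  }

Jobs 0-3 each end up with exactly one CPU; job 4 ends up with an empty
mask, which is the case thread_main() now rejects with EINVAL.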
--
Jens Axboe