Re: [Xen-devel] [PATCH v5 1/4] xen/rcu: don't use stop_machine_run() for rcu_barrier()

2020-03-13 Thread Jan Beulich
On 13.03.2020 12:18, Jürgen Groß wrote:
> On 13.03.20 12:02, Julien Grall wrote:
>> On 12/03/2020 08:28, Juergen Gross wrote:
>>> +    for ( ;; )
>>> +    {
>>> +        if ( !atomic_read(&pending_count) && get_cpu_maps() )
>>> +        {
>>> +            n_cpus = num_online_cpus();
>>> +
>>> +            if ( atomic_cmpxchg(&pending_count, 0, n_cpus + 1) == 0 )
>>> +                break;
>>> +
>>> +            put_cpu_maps();
>>> +        }
>>> +
>>> +        process_pending_softirqs();
>>> +        cpu_relax();
>>> +    }
>>> +
>>> +    atomic_set(&cpu_count, n_cpus);
>>> +    cpumask_raise_softirq(&cpu_online_map, RCU_SOFTIRQ);
>>> +
>>> +    put_cpu_maps();
>>
>> If you put the CPU maps, wouldn't it be possible to have a CPU turned 
>> off? If not, can you add a comment in the code why this is safe?
> 
> Yes, you are right. This might be possible, even if rather
> unlikely, as a cpu being removed has to be idle already, so
> the pending softirq should be picked up rather fast.

I think that's not the main aspect. The CPU to be removed may
already be spinning in cpu_hotplug_begin() (and may in particular
also already be past the rcu_barrier() that Igor's patch is going
to put there).
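
To illustrate the window (a sketch of the interleaving being described,
not code from the patch):

    CPU A: rcu_barrier()               CPU B: being unplugged
    --------------------------------   --------------------------------
    cpumask_raise_softirq(...)         spins in cpu_hotplug_begin()
    put_cpu_maps()                     takes the lock, goes offline
    waits for barrier completion       raised RCU_SOFTIRQ never handled

CPU A would then wait for a decrement that never comes, so the cpu maps
would need to stay held until the barrier handling has fully finished.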

Jan


Re: [Xen-devel] [PATCH v5 1/4] xen/rcu: don't use stop_machine_run() for rcu_barrier()

2020-03-13 Thread Jürgen Groß

On 13.03.20 12:02, Julien Grall wrote:

On 12/03/2020 08:28, Juergen Gross wrote:

Today rcu_barrier() is calling stop_machine_run() to synchronize all
physical cpus in order to ensure all pending rcu calls have finished
when returning.

As stop_machine_run() is using tasklets, this requires scheduling of
idle vcpus on all cpus, imposing the need to call rcu_barrier() on idle
cpus only in case of core scheduling being active, as otherwise a
scheduling deadlock would occur.

There is no need at all to do the syncing of the cpus in tasklets, as
rcu activity is started in __do_softirq() called whenever softirq
activity is allowed. So rcu_barrier() can easily be modified to use
softirq for synchronization of the cpus no longer requiring any
scheduling activity.

As there already is a rcu softirq, reuse that for the synchronization.

Remove the barrier element from struct rcu_data as it isn't used.

Finally switch rcu_barrier() to return void as it now can never fail.

Partially-based-on-patch-by: Igor Druzhinin 
Signed-off-by: Juergen Gross 
---
V2:
- add recursion detection

V3:
- fix races (Igor Druzhinin)

V5:
- rename done_count to pending_count (Jan Beulich)
- fix race (Jan Beulich)
---
  xen/common/rcupdate.c      | 92 +++++++++++++++++++++++++++++++++--------------
  xen/include/xen/rcupdate.h |  2 +-
  2 files changed, 66 insertions(+), 28 deletions(-)

diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
index 03d84764d2..c5ef6acb1e 100644
--- a/xen/common/rcupdate.c
+++ b/xen/common/rcupdate.c
@@ -83,7 +83,6 @@ struct rcu_data {
     struct rcu_head **donetail;
     long            blimit;         /* Upper limit on a processed batch */
     int             cpu;
-    struct rcu_head barrier;
     long            last_rs_qlen;   /* qlen during the last resched */
 
     /* 3) idle CPUs handling */
@@ -91,6 +90,7 @@ struct rcu_data {
     bool            idle_timer_active;
 
     bool            process_callbacks;
+    bool            barrier_active;
 };
 
 /*
@@ -143,51 +143,82 @@ static int qhimark = 10000;
  static int qlowmark = 100;
  static int rsinterval = 1000;
-struct rcu_barrier_data {
-    struct rcu_head head;
-    atomic_t *cpu_count;
-};
+/*
+ * rcu_barrier() handling:
+ * cpu_count holds the number of cpu required to finish barrier handling.

NIT: the number of cpus (I think)

+ * pending_count is initialized to nr_cpus + 1.
+ * Cpus are synchronized via softirq mechanism. rcu_barrier() is regarded to
+ * be active if pending_count is not zero. In case rcu_barrier() is called on
+ * multiple cpus it is enough to check for pending_count being not zero on entry
+ * and to call process_pending_softirqs() in a loop until pending_count drops to
+ * zero, before starting the new rcu_barrier() processing.
+ * In order to avoid hangs when rcu_barrier() is called multiple times on the
+ * same cpu in fast sequence and a slave cpu couldn't drop out of the
+ * barrier handling fast enough a second counter pending_count is needed.

As an aside question, don't we miss a memory barrier in
rcu_barrier_callback() or rcu_barrier_action()? This barrier would ensure
that the RCU changes have been seen before we tell the "master" CPU we
are done.


Sounds like a sensible idea.
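
A minimal sketch of what that could look like (my illustration, assuming
Xen's smp_mb(); not part of this version of the patch):

static void rcu_barrier_callback(struct rcu_head *head)
{
    /*
     * Make this cpu's RCU work globally visible before the barrier
     * master can observe the decrement (sketch only).
     */
    smp_mb();
    atomic_dec(&cpu_count);
}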



+ * The rcu_barrier() invoking cpu will wait until pending_count reaches 1
+ * (meaning that all cpus have finished processing the barrier) and then will
+ * reset pending_count to 0 to enable entering rcu_barrier() again.
+ */
+static atomic_t cpu_count = ATOMIC_INIT(0);
+static atomic_t pending_count = ATOMIC_INIT(0);
 static void rcu_barrier_callback(struct rcu_head *head)
 {
-    struct rcu_barrier_data *data = container_of(
-        head, struct rcu_barrier_data, head);
-    atomic_inc(data->cpu_count);
+    atomic_dec(&cpu_count);
 }
 
-static int rcu_barrier_action(void *_cpu_count)
+static void rcu_barrier_action(void)
 {
-    struct rcu_barrier_data data = { .cpu_count = _cpu_count };
-
-    ASSERT(!local_irq_is_enabled());
-    local_irq_enable();
+    struct rcu_head head;
 
     /*
      * When callback is executed, all previously-queued RCU work on this CPU
-     * is completed. When all CPUs have executed their callback, data.cpu_count
-     * will have been incremented to include every online CPU.
+     * is completed. When all CPUs have executed their callback, cpu_count
+     * will have been decremented to 0.
      */
-    call_rcu(&data.head, rcu_barrier_callback);
+    call_rcu(&head, rcu_barrier_callback);
 
-    while ( atomic_read(data.cpu_count) != num_online_cpus() )
+    while ( atomic_read(&cpu_count) )
     {
         process_pending_softirqs();
         cpu_relax();
     }
 
-    local_irq_disable();
-
-    return 0;
+    atomic_dec(&pending_count);
 }
 
-/*
- * As rcu_barrier() is using stop_machine_run() it is allowed to be used in
- * idle context only (see comment for stop_machine_run()).
- */
-int rcu_barrier(void)
+void rcu_barrier(void)
 {
-    atomic_t cpu_count = ATOMIC_INIT(0);
-    return stop_machine_run(rcu_barrier_action, &cpu_count, NR_CPUS);

Re: [Xen-devel] [PATCH v5 1/4] xen/rcu: don't use stop_machine_run() for rcu_barrier()

2020-03-13 Thread Julien Grall



On 12/03/2020 08:28, Juergen Gross wrote:

Today rcu_barrier() is calling stop_machine_run() to synchronize all
physical cpus in order to ensure all pending rcu calls have finished
when returning.

As stop_machine_run() is using tasklets, this requires scheduling of
idle vcpus on all cpus, imposing the need to call rcu_barrier() on idle
cpus only in case of core scheduling being active, as otherwise a
scheduling deadlock would occur.

There is no need at all to do the syncing of the cpus in tasklets, as
rcu activity is started in __do_softirq() called whenever softirq
activity is allowed. So rcu_barrier() can easily be modified to use
softirq for synchronization of the cpus no longer requiring any
scheduling activity.

As there already is a rcu softirq, reuse that for the synchronization.

Remove the barrier element from struct rcu_data as it isn't used.

Finally switch rcu_barrier() to return void as it now can never fail.

Partially-based-on-patch-by: Igor Druzhinin 
Signed-off-by: Juergen Gross 
---
V2:
- add recursion detection

V3:
- fix races (Igor Druzhinin)

V5:
- rename done_count to pending_count (Jan Beulich)
- fix race (Jan Beulich)
---
  xen/common/rcupdate.c  | 92 +++++++++++++++++++++++++++++++++--------------
  xen/include/xen/rcupdate.h |  2 +-
  2 files changed, 66 insertions(+), 28 deletions(-)

diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
index 03d84764d2..c5ef6acb1e 100644
--- a/xen/common/rcupdate.c
+++ b/xen/common/rcupdate.c
@@ -83,7 +83,6 @@ struct rcu_data {
     struct rcu_head **donetail;
     long            blimit;         /* Upper limit on a processed batch */
     int             cpu;
-    struct rcu_head barrier;
     long            last_rs_qlen;   /* qlen during the last resched */
 
     /* 3) idle CPUs handling */
@@ -91,6 +90,7 @@ struct rcu_data {
     bool            idle_timer_active;
 
     bool            process_callbacks;
+    bool            barrier_active;
 };
  
  /*

@@ -143,51 +143,82 @@ static int qhimark = 10000;
  static int qlowmark = 100;
  static int rsinterval = 1000;
  
-struct rcu_barrier_data {
-    struct rcu_head head;
-    atomic_t *cpu_count;
-};
+/*
+ * rcu_barrier() handling:
+ * cpu_count holds the number of cpu required to finish barrier handling.


NIT: the number of cpus (I think)


+ * pending_count is initialized to nr_cpus + 1.
+ * Cpus are synchronized via softirq mechanism. rcu_barrier() is regarded to
+ * be active if pending_count is not zero. In case rcu_barrier() is called on
+ * multiple cpus it is enough to check for pending_count being not zero on entry
+ * and to call process_pending_softirqs() in a loop until pending_count drops to
+ * zero, before starting the new rcu_barrier() processing.
+ * In order to avoid hangs when rcu_barrier() is called multiple times on the
+ * same cpu in fast sequence and a slave cpu couldn't drop out of the
+ * barrier handling fast enough a second counter pending_count is needed.


As an aside question, don't we miss a memory barrier in
rcu_barrier_callback() or rcu_barrier_action()? This barrier would ensure
that the RCU changes have been seen before we tell the "master" CPU we
are done.



+ * The rcu_barrier() invoking cpu will wait until pending_count reaches 1
+ * (meaning that all cpus have finished processing the barrier) and then will
+ * reset pending_count to 0 to enable entering rcu_barrier() again.
+ */
+static atomic_t cpu_count = ATOMIC_INIT(0);
+static atomic_t pending_count = ATOMIC_INIT(0);
  
 static void rcu_barrier_callback(struct rcu_head *head)
 {
-    struct rcu_barrier_data *data = container_of(
-        head, struct rcu_barrier_data, head);
-    atomic_inc(data->cpu_count);
+    atomic_dec(&cpu_count);
 }
  
-static int rcu_barrier_action(void *_cpu_count)
+static void rcu_barrier_action(void)
 {
-    struct rcu_barrier_data data = { .cpu_count = _cpu_count };
-
-    ASSERT(!local_irq_is_enabled());
-    local_irq_enable();
+    struct rcu_head head;
  
     /*
      * When callback is executed, all previously-queued RCU work on this CPU
-     * is completed. When all CPUs have executed their callback, data.cpu_count
-     * will have been incremented to include every online CPU.
+     * is completed. When all CPUs have executed their callback, cpu_count
+     * will have been decremented to 0.
      */
-    call_rcu(&data.head, rcu_barrier_callback);
+    call_rcu(&head, rcu_barrier_callback);
  
-    while ( atomic_read(data.cpu_count) != num_online_cpus() )
+    while ( atomic_read(&cpu_count) )
     {
         process_pending_softirqs();
         cpu_relax();
     }
 
-    local_irq_disable();
-
-    return 0;
+    atomic_dec(&pending_count);
 }
  
-/*
- * As rcu_barrier() is using stop_machine_run() it is allowed to be used in
- * idle context only (see comment for stop_machine_run()).
- */
-int rcu_barrier(void)
+void rcu_barrier(void)
 {
-    atomic_t cpu_count = ATOMIC_INIT(0);
-    return stop_machine_run(rcu_barrier_action, &cpu_count, NR_CPUS);
+
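
To make the two-counter scheme above concrete, a worked example (my
numbers, not from the thread), assuming 4 online cpus:

/*
 * rcu_barrier() entry:       pending_count 0 -> 5 (n_cpus + 1), cpu_count = 4
 * per-cpu callback runs:     cpu_count 4 -> 3 -> 2 -> 1 -> 0
 * each cpu leaves handling:  pending_count 5 -> 4 -> 3 -> 2 -> 1
 * invoking cpu sees pending_count == 1 and resets it to 0:
 * barrier done, a new rcu_barrier() may enter.
 */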

[Xen-devel] [PATCH v5 1/4] xen/rcu: don't use stop_machine_run() for rcu_barrier()

2020-03-12 Thread Juergen Gross
Today rcu_barrier() is calling stop_machine_run() to synchronize all
physical cpus in order to ensure all pending rcu calls have finished
when returning.

As stop_machine_run() is using tasklets, this requires scheduling of
idle vcpus on all cpus, imposing the need to call rcu_barrier() on idle
cpus only in case of core scheduling being active, as otherwise a
scheduling deadlock would occur.

There is no need at all to do the syncing of the cpus in tasklets, as
rcu activity is started in __do_softirq() called whenever softirq
activity is allowed. So rcu_barrier() can easily be modified to use
softirq for synchronization of the cpus no longer requiring any
scheduling activity.

As there already is a rcu softirq, reuse that for the synchronization.

Remove the barrier element from struct rcu_data as it isn't used.

Finally switch rcu_barrier() to return void as it now can never fail.

Partially-based-on-patch-by: Igor Druzhinin 
Signed-off-by: Juergen Gross 
---
V2:
- add recursion detection

V3:
- fix races (Igor Druzhinin)

V5:
- rename done_count to pending_count (Jan Beulich)
- fix race (Jan Beulich)
---
 xen/common/rcupdate.c  | 92 +++++++++++++++++++++++++++++++++--------------
 xen/include/xen/rcupdate.h |  2 +-
 2 files changed, 66 insertions(+), 28 deletions(-)

diff --git a/xen/common/rcupdate.c b/xen/common/rcupdate.c
index 03d84764d2..c5ef6acb1e 100644
--- a/xen/common/rcupdate.c
+++ b/xen/common/rcupdate.c
@@ -83,7 +83,6 @@ struct rcu_data {
     struct rcu_head **donetail;
     long            blimit;         /* Upper limit on a processed batch */
     int             cpu;
-    struct rcu_head barrier;
     long            last_rs_qlen;   /* qlen during the last resched */
 
     /* 3) idle CPUs handling */
@@ -91,6 +90,7 @@ struct rcu_data {
     bool            idle_timer_active;
 
     bool            process_callbacks;
+    bool            barrier_active;
 };
 
 /*
@@ -143,51 +143,82 @@ static int qhimark = 10000;
 static int qlowmark = 100;
 static int rsinterval = 1000;
 
-struct rcu_barrier_data {
-    struct rcu_head head;
-    atomic_t *cpu_count;
-};
+/*
+ * rcu_barrier() handling:
+ * cpu_count holds the number of cpu required to finish barrier handling.
+ * pending_count is initialized to nr_cpus + 1.
+ * Cpus are synchronized via softirq mechanism. rcu_barrier() is regarded to
+ * be active if pending_count is not zero. In case rcu_barrier() is called on
+ * multiple cpus it is enough to check for pending_count being not zero on entry
+ * and to call process_pending_softirqs() in a loop until pending_count drops to
+ * zero, before starting the new rcu_barrier() processing.
+ * In order to avoid hangs when rcu_barrier() is called multiple times on the
+ * same cpu in fast sequence and a slave cpu couldn't drop out of the
+ * barrier handling fast enough a second counter pending_count is needed.
+ * The rcu_barrier() invoking cpu will wait until pending_count reaches 1
+ * (meaning that all cpus have finished processing the barrier) and then will
+ * reset pending_count to 0 to enable entering rcu_barrier() again.
+ */
+static atomic_t cpu_count = ATOMIC_INIT(0);
+static atomic_t pending_count = ATOMIC_INIT(0);
 
 static void rcu_barrier_callback(struct rcu_head *head)
 {
-    struct rcu_barrier_data *data = container_of(
-        head, struct rcu_barrier_data, head);
-    atomic_inc(data->cpu_count);
+    atomic_dec(&cpu_count);
 }
 
-static int rcu_barrier_action(void *_cpu_count)
+static void rcu_barrier_action(void)
 {
-    struct rcu_barrier_data data = { .cpu_count = _cpu_count };
-
-    ASSERT(!local_irq_is_enabled());
-    local_irq_enable();
+    struct rcu_head head;
 
     /*
      * When callback is executed, all previously-queued RCU work on this CPU
-     * is completed. When all CPUs have executed their callback, data.cpu_count
-     * will have been incremented to include every online CPU.
+     * is completed. When all CPUs have executed their callback, cpu_count
+     * will have been decremented to 0.
      */
-    call_rcu(&data.head, rcu_barrier_callback);
+    call_rcu(&head, rcu_barrier_callback);
 
-    while ( atomic_read(data.cpu_count) != num_online_cpus() )
+    while ( atomic_read(&cpu_count) )
     {
         process_pending_softirqs();
         cpu_relax();
     }
 
-    local_irq_disable();
-
-    return 0;
+    atomic_dec(&pending_count);
 }
 
-/*
- * As rcu_barrier() is using stop_machine_run() it is allowed to be used in
- * idle context only (see comment for stop_machine_run()).
- */
-int rcu_barrier(void)
+void rcu_barrier(void)
 {
-    atomic_t cpu_count = ATOMIC_INIT(0);
-    return stop_machine_run(rcu_barrier_action, &cpu_count, NR_CPUS);
+    unsigned int n_cpus;
+
+    for ( ;; )
+    {
+        if ( !atomic_read(&pending_count) && get_cpu_maps() )
+        {
+            n_cpus = num_online_cpus();
+
+            if ( atomic_cmpxchg(&pending_count, 0, n_cpus + 1) == 0 )
+                break;
+
+            put_cpu_maps();
+        }
+
+        process_pending_softirqs();
+        cpu_relax();
+    }
+
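
The archive truncates the patch here. Based on the fragment quoted in
Jan Beulich's reply above, the function plausibly continues as follows
(a reconstruction from the quoted snippet, not the authoritative patch):

+    atomic_set(&cpu_count, n_cpus);
+    cpumask_raise_softirq(&cpu_online_map, RCU_SOFTIRQ);
+
+    put_cpu_maps();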