Re: [Xen-devel] [PATCH 2/8] x86: distinguish CPU offlining from CPU removal

2018-07-13 Thread Wei Liu
On Thu, Jul 12, 2018 at 05:48:40AM -0600, Jan Beulich wrote:
> >>> On 12.07.18 at 12:53,  wrote:
> > On Wed, Jul 11, 2018 at 06:06:04AM -0600, Jan Beulich wrote:
> > [...]
> >> --- a/xen/arch/x86/smpboot.c
> >> +++ b/xen/arch/x86/smpboot.c
> >> @@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
> >>  cpumask_t cpu_online_map __read_mostly;
> >>  EXPORT_SYMBOL(cpu_online_map);
> >>  
> >> +bool __read_mostly park_offline_cpus;
> >> +
> >>  unsigned int __read_mostly nr_sockets;
> >>  cpumask_t **__read_mostly socket_cpumask;
> >>  static cpumask_t *secondary_socket_cpumask;
> >> @@ -887,7 +889,7 @@ static void cleanup_cpu_root_pgt(unsigne
> >>  }
> >>  }
> >>  
> >> -static void cpu_smpboot_free(unsigned int cpu)
> >> +static void cpu_smpboot_free(unsigned int cpu, bool all)
> > 
> > I think "all" is too vague. It doesn't convey the idea what constitutes
> > "partial". But I don't have any better suggestion either.
> 
> Indeed I've been trying to come up with a better name before
> posting, but couldn't. One thing though - I don't think the name
> should in anyway be required to express what "partial" means.
> 

A sentence or two to describe what !all is for would be helpful.

Wei.

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH 2/8] x86: distinguish CPU offlining from CPU removal

2018-07-12 Thread Andrew Cooper
On 11/07/18 13:06, Jan Beulich wrote:
> In order to be able to service #MC on offlined CPUs, GDT, IDT, stack,

For clarity, I'd phrase this as "CPUs, the GDT, ..." to help visually
separate CPUs as it isn't a part of the rest of the list.

> --- a/xen/arch/x86/genapic/x2apic.c
> +++ b/xen/arch/x86/genapic/x2apic.c
> @@ -201,18 +201,25 @@ static int update_clusterinfo(
>  if ( !cluster_cpus_spare )
>  cluster_cpus_spare = xzalloc(cpumask_t);
>  if ( !cluster_cpus_spare ||
> - !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
> + !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
>  err = -ENOMEM;
>  break;
>  case CPU_UP_CANCELED:
>  case CPU_DEAD:
> +case CPU_REMOVE:
> +if ( park_offline_cpus == (action != CPU_REMOVE) )
> +break;
>  if ( per_cpu(cluster_cpus, cpu) )
>  {
>  cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
>  if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
> +{
>  xfree(per_cpu(cluster_cpus, cpu));
> +per_cpu(cluster_cpus, cpu) = NULL;
> +}
>  }
>  free_cpumask_var(per_cpu(scratch_mask, cpu));
> +clear_cpumask_var(&per_cpu(scratch_mask, cpu));

freeing and NULL-ing the pointer at the same time is already a common
pattern. How about introducing

/* Free an allocation, and zero the pointer to it. */
#define XFREE(p)                    \
({                                  \
    typeof(*p) *_p = (p);           \
                                    \
    xfree(_p);                      \
    _p = NULL;                      \
})

and a similar wrapper for FREE_CPUMASK_VAR(p) which takes care of the
NULL-ing as well?

In time as these start to get used more widely, it should reduce the
chances of double-freeing in cleanup paths.

> @@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
>  cpumask_t cpu_online_map __read_mostly;
>  EXPORT_SYMBOL(cpu_online_map);
>  
> +bool __read_mostly park_offline_cpus;
> +
>  unsigned int __read_mostly nr_sockets;
>  cpumask_t **__read_mostly socket_cpumask;
>  static cpumask_t *secondary_socket_cpumask;
> @@ -887,7 +889,7 @@ static void cleanup_cpu_root_pgt(unsigne
>  }
>  }
>  
> -static void cpu_smpboot_free(unsigned int cpu)
> +static void cpu_smpboot_free(unsigned int cpu, bool all)

Perhaps "remove" or "remove_cpu", to match the CPU_REMOVE terminology?

Also, we probably want a comment here explaining the difference between
parking and removing in terms of what infrastructure needs to remain valid.

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH 2/8] x86: distinguish CPU offlining from CPU removal

2018-07-12 Thread Jan Beulich
>>> On 12.07.18 at 12:53,  wrote:
> On Wed, Jul 11, 2018 at 06:06:04AM -0600, Jan Beulich wrote:
> [...]
>> --- a/xen/arch/x86/smpboot.c
>> +++ b/xen/arch/x86/smpboot.c
>> @@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
>>  cpumask_t cpu_online_map __read_mostly;
>>  EXPORT_SYMBOL(cpu_online_map);
>>  
>> +bool __read_mostly park_offline_cpus;
>> +
>>  unsigned int __read_mostly nr_sockets;
>>  cpumask_t **__read_mostly socket_cpumask;
>>  static cpumask_t *secondary_socket_cpumask;
>> @@ -887,7 +889,7 @@ static void cleanup_cpu_root_pgt(unsigne
>>  }
>>  }
>>  
>> -static void cpu_smpboot_free(unsigned int cpu)
>> +static void cpu_smpboot_free(unsigned int cpu, bool all)
> 
> I think "all" is too vague. It doesn't convey the idea what constitutes
> "partial". But I don't have any better suggestion either.

Indeed I've been trying to come up with a better name before
posting, but couldn't. One thing though - I don't think the name
should in anyway be required to express what "partial" means.

>> @@ -898,15 +900,24 @@ static void cpu_smpboot_free(unsigned in
>>  socket_cpumask[socket] = NULL;
>>  }
>>  
>> -c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
>> -c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
>> -c[cpu].compute_unit_id = INVALID_CUID;
>>  cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
>>  
>> -free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
>> -free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>> -if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
>> -free_cpumask_var(per_cpu(scratch_cpumask, cpu));
>> +if ( all )
>> +{
>> +c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
>> +c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
>> +c[cpu].compute_unit_id = INVALID_CUID;
>> +
>> +free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
>> +clear_cpumask_var(&per_cpu(cpu_sibling_mask, cpu));
>> +free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>> +clear_cpumask_var(&per_cpu(cpu_core_mask, cpu));
>> +if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
>> +{
>> +free_cpumask_var(per_cpu(scratch_cpumask, cpu));
>> +clear_cpumask_var(&per_cpu(scratch_cpumask, cpu));
>> +}
> 
> One thing that's not mentioned in the commit message is why various
> masks are kept. I guess they are needed for some other parts of the
> system to remain operational.

Hmm, I've taken the opposite approach: Free in "partial" mode
only what I could prove won't be needed.

Jan



___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH 2/8] x86: distinguish CPU offlining from CPU removal

2018-07-12 Thread Wei Liu
On Wed, Jul 11, 2018 at 06:06:04AM -0600, Jan Beulich wrote:
[...]
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
>  cpumask_t cpu_online_map __read_mostly;
>  EXPORT_SYMBOL(cpu_online_map);
>  
> +bool __read_mostly park_offline_cpus;
> +
>  unsigned int __read_mostly nr_sockets;
>  cpumask_t **__read_mostly socket_cpumask;
>  static cpumask_t *secondary_socket_cpumask;
> @@ -887,7 +889,7 @@ static void cleanup_cpu_root_pgt(unsigne
>  }
>  }
>  
> -static void cpu_smpboot_free(unsigned int cpu)
> +static void cpu_smpboot_free(unsigned int cpu, bool all)

I think "all" is too vague. It doesn't convey the idea what constitutes
"partial". But I don't have any better suggestion either.

>  {
>  unsigned int order, socket = cpu_to_socket(cpu);
>  struct cpuinfo_x86 *c = cpu_data;
> @@ -898,15 +900,24 @@ static void cpu_smpboot_free(unsigned in
>  socket_cpumask[socket] = NULL;
>  }
>  
> -c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
> -c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
> -c[cpu].compute_unit_id = INVALID_CUID;
>  cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
>  
> -free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
> -free_cpumask_var(per_cpu(cpu_core_mask, cpu));
> -if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
> -free_cpumask_var(per_cpu(scratch_cpumask, cpu));
> +if ( all )
> +{
> +c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
> +c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
> +c[cpu].compute_unit_id = INVALID_CUID;
> +
> +free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
> +clear_cpumask_var(&per_cpu(cpu_sibling_mask, cpu));
> +free_cpumask_var(per_cpu(cpu_core_mask, cpu));
> +clear_cpumask_var(&per_cpu(cpu_core_mask, cpu));
> +if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
> +{
> +free_cpumask_var(per_cpu(scratch_cpumask, cpu));
> +clear_cpumask_var(&per_cpu(scratch_cpumask, cpu));
> +}

One thing that's not mentioned in the commit message is why various
masks are kept. I guess they are needed for some other parts of the
system to remain operational.

The rest looks fine to me.

Wei.

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 2/8] x86: distinguish CPU offlining from CPU removal

2018-07-11 Thread Jan Beulich
In order to be able to service #MC on offlined CPUs, GDT, IDT, stack,
and per-CPU data (which includes the TSS) need to be kept allocated.
They should only be freed upon CPU removal (which we currently don't
support, so some code is becoming effectively dead for the moment).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int c
 
 mcabanks_free(poll);
 mcabanks_free(clr);
+
+per_cpu(poll_bankmask, cpu) = NULL;
+per_cpu(mce_clear_banks, cpu) = NULL;
 }
 
 static int cpu_bank_alloc(unsigned int cpu)
 {
-struct mca_banks *poll = mcabanks_alloc();
-struct mca_banks *clr = mcabanks_alloc();
+struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
+struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
 
 if ( !poll || !clr )
 {
@@ -725,7 +728,13 @@ static int cpu_callback(
 
 case CPU_UP_CANCELED:
 case CPU_DEAD:
-cpu_bank_free(cpu);
+if ( !park_offline_cpus )
+cpu_bank_free(cpu);
+break;
+
+case CPU_REMOVE:
+if ( park_offline_cpus )
+cpu_bank_free(cpu);
 break;
 }
 
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -107,10 +107,11 @@ static void play_dead(void)
 local_irq_disable();
 
 /*
- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
- * as they may be freed at any time. In this case, heap corruption or
- * #PF can occur (when heap debugging is enabled). For example, even
- * printk() can involve tasklet scheduling, which touches per-cpu vars.
+ * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible,
+ * as they may be freed at any time if offline CPUs don't get parked. In
+ * this case, heap corruption or #PF can occur (when heap debugging is
+ * enabled). For example, even printk() can involve tasklet scheduling,
+ * which touches per-cpu vars.
  * 
  * Consider very carefully when adding code to *dead_idle. Most hypervisor
  * subsystems are unsafe to call.
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -201,18 +201,25 @@ static int update_clusterinfo(
 if ( !cluster_cpus_spare )
 cluster_cpus_spare = xzalloc(cpumask_t);
 if ( !cluster_cpus_spare ||
- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
 err = -ENOMEM;
 break;
 case CPU_UP_CANCELED:
 case CPU_DEAD:
+case CPU_REMOVE:
+if ( park_offline_cpus == (action != CPU_REMOVE) )
+break;
 if ( per_cpu(cluster_cpus, cpu) )
 {
 cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
 if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
+{
 xfree(per_cpu(cluster_cpus, cpu));
+per_cpu(cluster_cpus, cpu) = NULL;
+}
 }
 free_cpumask_var(per_cpu(scratch_mask, cpu));
+clear_cpumask_var(&per_cpu(scratch_mask, cpu));
 break;
 }
 
--- a/xen/arch/x86/percpu.c
+++ b/xen/arch/x86/percpu.c
@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int
 char *p;
 
 if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
-return -EBUSY;
+return 0;
 
 if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
 return -ENOMEM;
@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
 break;
 case CPU_UP_CANCELED:
 case CPU_DEAD:
-free_percpu_area(cpu);
+if ( !park_offline_cpus )
+free_percpu_area(cpu);
 break;
-default:
+case CPU_REMOVE:
+if ( park_offline_cpus )
+free_percpu_area(cpu);
 break;
 }
 
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
 cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+bool __read_mostly park_offline_cpus;
+
 unsigned int __read_mostly nr_sockets;
 cpumask_t **__read_mostly socket_cpumask;
 static cpumask_t *secondary_socket_cpumask;
@@ -887,7 +889,7 @@ static void cleanup_cpu_root_pgt(unsigne
 }
 }
 
-static void cpu_smpboot_free(unsigned int cpu)
+static void cpu_smpboot_free(unsigned int cpu, bool all)
 {
 unsigned int order, socket = cpu_to_socket(cpu);
 struct cpuinfo_x86 *c = cpu_data;
@@ -898,15 +900,24 @@ static void cpu_smpboot_free(unsigned in
 socket_cpumask[socket] = NULL;
 }
 
-c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
-c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
-c[cpu].compute_unit_id = INVALID_CUID;
 cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
 
-free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
-free_cpumask_var(per_cpu(cpu_core_mask, cpu));
-if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
-