On 2015/11/26 5:12, Thomas Gleixner wrote:
> On Wed, 25 Nov 2015, Thomas Gleixner wrote:
>> So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
>> does not get another IPI before the next move ..... That has been that
>> way forever.
>>
>> Duh. Working on a real fix this time.
> 
> Here you go. Completely untested of course.
> 
> Larger than I hoped for, but the simple fix of just clearing the
> move_in_progress flag before sending the IPI does not work because:
> 
> CPU0                        CPU1                      CPU2
> data->move_in_progress=0
> sendIPI()                     
>                             set_affinity()
>                             lock_vector()             handle_IPI
>                             move_in_progress = 1      lock_vector()
>                             unlock_vector()
>                                                       move_in_progress == 1
>                                                       -> no cleanup
> 
> So we are back to square one. Now one might think that taking vector
> lock prevents that issue:
> 
> CPU0                        CPU1                      CPU2
> lock_vector()
> data->move_in_progress=0
> sendIPI()                     
> unlock_vector()
>                             set_affinity()
>                             assign_irq_vector()
>                             lock_vector()             handle_IPI
>                             move_in_progress = 1      lock_vector()
>                             unlock_vector()
>                                                       move_in_progress == 1
> Not really. 
> 
> So now the solution is:
> 
> CPU0                        CPU1                      CPU2
> lock_vector()
> data->move_in_progress=0
> data->cleanup_mask = data->old_domain
> sendIPI()                     
> unlock_vector()
>                             set_affinity()
>                             assign_irq_vector()
>                             lock_vector()             
>                             if (move_in_progress ||
>                                 !empty(cleanup_mask)) {
>                                unlock_vector()
>                                return -EBUSY;         handle_IPI
>                             }                         lock_vector()
>                                                       move_in_progress == 0
>                                                       cpu is set in cleanup mask
>                                                       ->cleanup vector
> 
> Looks a bit overkill with the extra cpumask. I tried a simple counter
> but that does not work versus cpu unplug as we do not know whether the
> outgoing cpu is involved in the cleanup or not. And if the cpu is
> involved we starve assign_irq_vector() ....
> 
> The upside of this is that we get rid of that atomic allocation in
> __send_cleanup_vector().
Hi Thomas,
        Maybe more headaches for you now :)
        It seems there is still room for improvement. First, it
seems we could just reuse old_domain instead of adding cleanup_mask.
Second, I found another race window among x86_vector_free_irqs(),
__send_cleanup_vector() and smp_irq_move_cleanup_interrupt().
I'm trying to refine your patch based on the following rules
(rough sketch below):
1) move_in_progress controls whether we need to send IPIs
2) old_domain controls which CPUs we should do cleanup on
3) assign_irq_vector() checks both move_in_progress and old_domain.
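Roughly what I have in mind, keeping your locking in
__send_cleanup_vector() but dropping the extra mask (completely
untested sketch, details may change in the real patch):

	static int __assign_irq_vector(int irq, struct apic_chip_data *d,
				       const struct cpumask *mask)
	{
		/* 3) busy while a move is pending or cleanup is outstanding */
		if (d->move_in_progress || !cpumask_empty(d->old_domain))
			return -EBUSY;
		...
	}

	static void __send_cleanup_vector(struct apic_chip_data *data)
	{
		raw_spin_lock(&vector_lock);
		/* 2) old_domain keeps exactly the CPUs which still need cleanup */
		cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
		/* 1) move_in_progress only controls whether IPIs must be sent */
		data->move_in_progress = 0;
		apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
		raw_spin_unlock(&vector_lock);
	}

and smp_irq_move_cleanup_interrupt() would test and clear the current
cpu in data->old_domain instead of data->cleanup_mask.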
Will send out the patch soon for comments:)
Thanks,
Gerry                   

> 
> Brain hurts by now. 
> 
> Not-Yet-Signed-off-by: Thomas Gleixner <[email protected]>
> ---
>  arch/x86/kernel/apic/vector.c |   37 ++++++++++++++++---------------------
>  1 file changed, 16 insertions(+), 21 deletions(-)
> 
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -25,6 +25,7 @@ struct apic_chip_data {
>       struct irq_cfg          cfg;
>       cpumask_var_t           domain;
>       cpumask_var_t           old_domain;
> +     cpumask_var_t           cleanup_mask;
>       u8                      move_in_progress : 1;
>  };
>  
> @@ -83,7 +84,11 @@ static struct apic_chip_data *alloc_apic
>               goto out_data;
>       if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
>               goto out_domain;
> +     if (!zalloc_cpumask_var_node(&data->cleanup_mask, GFP_KERNEL, node))
> +             goto out_old;
>       return data;
> +out_old:
> +     free_cpumask_var(data->old_domain);
>  out_domain:
>       free_cpumask_var(data->domain);
>  out_data:
> @@ -96,6 +101,7 @@ static void free_apic_chip_data(struct a
>       if (data) {
>               free_cpumask_var(data->domain);
>               free_cpumask_var(data->old_domain);
> +             free_cpumask_var(data->cleanup_mask);
>               kfree(data);
>       }
>  }
> @@ -118,7 +124,7 @@ static int __assign_irq_vector(int irq,
>       static int current_offset = VECTOR_OFFSET_START % 16;
>       int cpu, err;
>  
> -     if (d->move_in_progress)
> +     if (d->move_in_progress || !cpumask_empty(d->cleanup_mask))
>               return -EBUSY;
>  
>       /* Only try and allocate irqs on cpus that are present */
> @@ -511,20 +517,11 @@ static struct irq_chip lapic_controller
>  #ifdef CONFIG_SMP
>  static void __send_cleanup_vector(struct apic_chip_data *data)
>  {
> -     cpumask_var_t cleanup_mask;
> -
> -     if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
> -             unsigned int i;
> -
> -             for_each_cpu_and(i, data->old_domain, cpu_online_mask)
> -                     apic->send_IPI_mask(cpumask_of(i),
> -                                         IRQ_MOVE_CLEANUP_VECTOR);
> -     } else {
> -             cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
> -             apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> -             free_cpumask_var(cleanup_mask);
> -     }
> +     raw_spin_lock(&vector_lock);
> +     cpumask_and(data->cleanup_mask, data->old_domain, cpu_online_mask);
>       data->move_in_progress = 0;
> +     apic->send_IPI_mask(data->cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> +     raw_spin_unlock(&vector_lock);
>  }
>  
>  void send_cleanup_vector(struct irq_cfg *cfg)
> @@ -568,14 +565,11 @@ asmlinkage __visible void smp_irq_move_c
>                       goto unlock;
>  
>               /*
> -              * Check if the irq migration is in progress. If so, we
> -              * haven't received the cleanup request yet for this irq.
> +              * Nothing to cleanup if irq migration is in progress
> +              * or this cpu is not set in the cleanup mask.
>                */
> -             if (data->move_in_progress)
> -                     goto unlock;
> -
> -             if (vector == data->cfg.vector &&
> -                 cpumask_test_cpu(me, data->domain))
> +             if (data->move_in_progress ||
> +                 !cpumask_test_cpu(me, data->cleanup_mask))
>                       goto unlock;
>  
>               irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
> @@ -591,6 +585,7 @@ asmlinkage __visible void smp_irq_move_c
>                       goto unlock;
>               }
>               __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
> +             cpumask_clear_cpu(me, data->cleanup_mask);
>  unlock:
>               raw_spin_unlock(&desc->lock);
>       }
> 