Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-21 Thread Thomas Gleixner
Andy Lutomirski  writes:
> On Wed, May 20, 2020 at 12:17 PM Thomas Gleixner  wrote:
>>
>> Andy Lutomirski  writes:
>> > Andrew Cooper pointed out that there is too much magic in Xen for this
>> > to work.  So never mind.
>>
>> :)
>>
>> But you made me stare more at that stuff and I came up with a way
>> simpler solution. See below.
>
> I like it, but I bet it can be even simpler if you do the
> tickle_whatever_paulmck_call_it() change:
>
>> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
>> +{
>> +   struct pt_regs *old_regs;
>> +   bool inhcall;
>> +
>> +   idtentry_enter(regs);
>> +   old_regs = set_irq_regs(regs);
>> +
>> +   run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
>> +
>> +   set_irq_regs(old_regs);
>> +
>> +   inhcall = get_and_clear_inhcall();
>> +   __idtentry_exit(regs, inhcall);
>> +   restore_inhcall(inhcall);
>
> How about:
>
>inhcall = get_and_clear_inhcall();
>if (inhcall) {
> if (!WARN_ON_ONCE((regs->flags & X86_EFLAGS_IF) || preempt_count())) {
>   local_irq_enable();
>   cond_resched();
>   local_irq_disable();

This really wants to use preempt_schedule_irq() as the above is racy
vs. need_resched().

> }
>  }
>  restore_inhcall(inhcall);
>  idtentry_exit(regs);
>
> This could probably be tidied up by having a xen_maybe_preempt() that
> does the inhcall and resched mess.
>
> The point is that, with the tickle_nohz_ stuff, there is nothing
> actually preventing IRQ handlers from sleeping as long as they aren't
> on the IRQ stack and as long as the interrupted context was safe to
> sleep in.

You still lose the debug checks. I'm working on it ...

Thanks,

tglx


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-21 Thread Thomas Gleixner
Boris Ostrovsky  writes:

> On 5/20/20 3:16 PM, Thomas Gleixner wrote:
>
>
>> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
>> +{
>> +struct pt_regs *old_regs;
>> +bool inhcall;
>> +
>> +idtentry_enter(regs);
>> +old_regs = set_irq_regs(regs);
>> +
>> +run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
>
>
> We need to handle nested case (i.e. !irq_needs_irq_stack(), like in your
> original version). Moving get_and_clear_inhcall() up should prevent
> scheduling when this happens.

I locally changed run_on_irqstack() to do the magic checks and select the
right one.

Thanks,

tglx


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Boris Ostrovsky
On 5/20/20 3:16 PM, Thomas Gleixner wrote:


> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs;
> + bool inhcall;
> +
> + idtentry_enter(regs);
> + old_regs = set_irq_regs(regs);
> +
> + run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);


We need to handle nested case (i.e. !irq_needs_irq_stack(), like in your
original version). Moving get_and_clear_inhcall() up should prevent
scheduling when this happens.


-boris


> +
> + set_irq_regs(old_regs);
> +
> + inhcall = get_and_clear_inhcall();
> + __idtentry_exit(regs, inhcall);
> + restore_inhcall(inhcall);
>  }



Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Andy Lutomirski
On Wed, May 20, 2020 at 12:17 PM Thomas Gleixner  wrote:
>
> Andy Lutomirski  writes:
> > Andrew Cooper pointed out that there is too much magic in Xen for this
> > to work.  So never mind.
>
> :)
>
> But you made me stare more at that stuff and I came up with a way
> simpler solution. See below.

I like it, but I bet it can be even simpler if you do the
tickle_whatever_paulmck_call_it() change:

> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> +   struct pt_regs *old_regs;
> +   bool inhcall;
> +
> +   idtentry_enter(regs);
> +   old_regs = set_irq_regs(regs);
> +
> +   run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
> +
> +   set_irq_regs(old_regs);
> +
> +   inhcall = get_and_clear_inhcall();
> +   __idtentry_exit(regs, inhcall);
> +   restore_inhcall(inhcall);

How about:

   inhcall = get_and_clear_inhcall();
   if (inhcall) {
if (!WARN_ON_ONCE((regs->flags & X86_EFLAGS_IF) || preempt_count())) {
  local_irq_enable();
  cond_resched();
  local_irq_disable();
}
 }
 restore_inhcall(inhcall);
 idtentry_exit(regs);

This could probably be tidied up by having a xen_maybe_preempt() that
does the inhcall and resched mess.

The point is that, with the tickle_nohz_ stuff, there is nothing
actually preventing IRQ handlers from sleeping as long as they aren't
on the IRQ stack and as long as the interrupted context was safe to
sleep in.

--Andy


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Thomas Gleixner
Andy Lutomirski  writes:
> Andrew Cooper pointed out that there is too much magic in Xen for this
> to work.  So never mind.

:)

But you made me stare more at that stuff and I came up with a way
simpler solution. See below.

Thanks,

tglx

8<--

 arch/x86/entry/common.c |   75 ++--
 arch/x86/entry/entry_32.S   |   17 -
 arch/x86/entry/entry_64.S   |   22 +++
 arch/x86/include/asm/idtentry.h |   13 ++
 arch/x86/xen/setup.c|4 +-
 arch/x86/xen/smp_pv.c   |3 +
 arch/x86/xen/xen-asm_32.S   |   12 +++---
 arch/x86/xen/xen-asm_64.S   |2 -
 arch/x86/xen/xen-ops.h  |1 
 drivers/xen/Makefile|2 -
 drivers/xen/preempt.c   |   42 --
 11 files changed, 115 insertions(+), 78 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 #include 
 #include 
 #include 
@@ -35,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -539,7 +543,8 @@ void noinstr idtentry_enter(struct pt_re
}
 }
 
-static __always_inline void __idtentry_exit(struct pt_regs *regs)
+static __always_inline void __idtentry_exit(struct pt_regs *regs,
+   bool may_sched)
 {
lockdep_assert_irqs_disabled();
 
@@ -548,7 +553,7 @@ static __always_inline void __idtentry_e
prepare_exit_to_usermode(regs);
} else if (regs->flags & X86_EFLAGS_IF) {
/* Check kernel preemption, if enabled */
-   if (IS_ENABLED(CONFIG_PREEMPTION)) {
+   if (IS_ENABLED(CONFIG_PREEMPTION) || may_sched) {
/*
 * This needs to be done very carefully.
 * idtentry_enter() invoked rcu_irq_enter(). This
@@ -612,5 +617,69 @@ static __always_inline void __idtentry_e
  */
 void noinstr idtentry_exit(struct pt_regs *regs)
 {
-   __idtentry_exit(regs);
+   __idtentry_exit(regs, false);
+}
+
+#ifdef CONFIG_XEN_PV
+
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+   bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+   __this_cpu_write(xen_in_preemptible_hcall, false);
+   return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+   __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(void)
+{
+   irq_enter_rcu();
+   inc_irq_stat(irq_hv_callback_count);
+
+   xen_hvm_evtchn_do_upcall();
+
+   irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+   struct pt_regs *old_regs;
+   bool inhcall;
+
+   idtentry_enter(regs);
+   old_regs = set_irq_regs(regs);
+
+   run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
+
+   set_irq_regs(old_regs);
+
+   inhcall = get_and_clear_inhcall();
+   __idtentry_exit(regs, inhcall);
+   restore_inhcall(inhcall);
 }
+#endif /* CONFIG_XEN_PV */
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1298,7 +1298,10 @@ SYM_CODE_END(native_iret)
 #endif
 
 #ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
+/*
+ * See comment in entry_64.S for further explanation
+ */
+SYM_FUNC_START(exc_xen_hypervisor_callback)
/*
 * Check to see if we got the event in the critical
 * region in xen_iret_direct, after we've reenabled
@@ -1315,14 +1318,11 @@ SYM_FUNC_START(xen_hypervisor_callback)
	pushl   $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
-   TRACE_IRQS_OFF
+
mov %esp, %eax
-   callxen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
-   callxen_maybe_preempt_hcall
-#endif
-   jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
+   callxen_pv_evtchn_do_upcall
+   jmp handle_exception_return
+SYM_FUNC_END(exc_xen_hypervisor_callback)
 
 /*
  * Hypervisor uses this for application faults while it executes.
@@ -1464,6 +1464,7 @@ 

Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Andy Lutomirski
On Wed, May 20, 2020 at 8:16 AM Andy Lutomirski  wrote:
>
> On Wed, May 20, 2020 at 7:13 AM Thomas Gleixner  wrote:
> >
> > Andy Lutomirski  writes:
> > > On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner  
> > > wrote:
> > >> Which brings you into the situation that you call schedule() from the
> > >> point where we just moved it out. If we would go there we'd need to
> > >> ensure that RCU is watching as well. idtentry_exit() might have it
> > >> turned off 
> > >
> > > I don't think this is possible.  Once you untangle all the wrappers,
> > > the call sites are effectively:
> > >
> > > __this_cpu_write(xen_in_preemptible_hcall, true);
> > > CALL_NOSPEC to the hypercall page
> > > __this_cpu_write(xen_in_preemptible_hcall, false);
> > >
> > > I think IF=1 when this happens, but I won't swear to it.  RCU had
> > > better be watching.
> > >
> > > As I understand it, the one and only situation Xen wants to handle is
> > > that an interrupt gets delivered during the hypercall.  The hypervisor
> > > is too clever for its own good and deals with this by rewinding RIP to
> > > the beginning of whatever instruction did the hypercall and delivers
> > > the interrupt, and we end up in this handler.  So, if this happens,
> > > the idea is to not only handle the interrupt but to schedule if
> > > scheduling would be useful.
> > >
> > > So I don't think we need all this RCU magic.  This really ought to be
> > > able to be simplified to:
> > >
> > > idtentry_exit();
> > >
> > > if (appropriate condition)
> > >   schedule();
> >
> > This is exactly the kind of tinkering which causes all kinds of trouble.
> >
> > idtentry_exit()
> >
> > if (user_mode(regs)) {
> > prepare_exit_to_usermode(regs);
> > } else if (regs->flags & X86_EFLAGS_IF) {
> > /* Check kernel preemption, if enabled */
> > if (IS_ENABLED(CONFIG_PREEMPTION)) {
> > 
> > }
> > instrumentation_begin();
> > /* Tell the tracer that IRET will enable interrupts */
> > trace_hardirqs_on_prepare();
> > lockdep_hardirqs_on_prepare(CALLER_ADDR0);
> > instrumentation_end();
> > rcu_irq_exit();
> > lockdep_hardirqs_on(CALLER_ADDR0);
> > } else {
> > /* IRQ flags state is correct already. Just tell RCU */
> > rcu_irq_exit();
> > }
> >
> > So in case IF is set then this already told the tracer and lockdep that
> > interrupts are enabled. And contrary to the ugly version this exit path
> > does not use rcu_irq_exit_preempt() which is there to warn about crappy
> > RCU state when trying to schedule.
> >
> > So we went great length to sanitize _all_ of this and make it consistent
> > just to say: screw it for that xen thingy.
> >
> > The extra checks and extra warnings for scheduling come with the
> > guarantee to bitrot when idtentry_exit() or any logic invoked from there
> > is changed. It's going to look like this:
> >
> > /*
> >  * If the below causes problems due to inconsistent state
> >  * or out of sync sanity checks, please complain to
> >  * l...@kernel.org directly.
> >  */
> > idtentry_exit();
> >
> > if (user_mode(regs) || !(regs->flags & X86_EFLAGS_IF))
> > return;
> >
> > if (!__this_cpu_read(xen_in_preemptible_hcall))
> > return;
> >
> > rcu_sanity_check_for_preemption();
> >
> > if (need_resched()) {
> > instrumentation_begin();
> > xen_maybe_preempt_hcall();
> > trace_hardirqs_on();
> > instrumentation_end();
> > }
> >
> > Of course you need the extra rcu_sanity_check_for_preemption() function
> > just for this muck.
> >
> > That's a true win on all ends? I don't think so.
>
> Hmm, fair enough.  I guess the IRQ tracing messes a bunch of this logic up.
>
> Let's keep your patch as is and consider cleanups later.  One approach
> might be to make this work more like extable handling: instead of
> trying to schedule from inside the interrupt handler here, patch up
> RIP and perhaps some other registers and let the actual Xen code just
> do cond_resched().  IOW, try to make this work the way it always
> should have:
>
> int ret;
> do {
>   ret = issue_the_hypercall();
>   cond_resched();
> } while (ret == EAGAIN);

Andrew Cooper pointed out that there is too much magic in Xen for this
to work.  So never mind.


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Andy Lutomirski
On Wed, May 20, 2020 at 7:13 AM Thomas Gleixner  wrote:
>
> Andy Lutomirski  writes:
> > On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner  wrote:
> >> Which brings you into the situation that you call schedule() from the
> >> point where we just moved it out. If we would go there we'd need to
> >> ensure that RCU is watching as well. idtentry_exit() might have it
> >> turned off 
> >
> > I don't think this is possible.  Once you untangle all the wrappers,
> > the call sites are effectively:
> >
> > __this_cpu_write(xen_in_preemptible_hcall, true);
> > CALL_NOSPEC to the hypercall page
> > __this_cpu_write(xen_in_preemptible_hcall, false);
> >
> > I think IF=1 when this happens, but I won't swear to it.  RCU had
> > better be watching.
> >
> > As I understand it, the one and only situation Xen wants to handle is
> > that an interrupt gets delivered during the hypercall.  The hypervisor
> > is too clever for its own good and deals with this by rewinding RIP to
> > the beginning of whatever instruction did the hypercall and delivers
> > the interrupt, and we end up in this handler.  So, if this happens,
> > the idea is to not only handle the interrupt but to schedule if
> > scheduling would be useful.
> >
> > So I don't think we need all this RCU magic.  This really ought to be
> > able to be simplified to:
> >
> > idtentry_exit();
> >
> > if (appropriate condition)
> >   schedule();
>
> This is exactly the kind of tinkering which causes all kinds of trouble.
>
> idtentry_exit()
>
> if (user_mode(regs)) {
> prepare_exit_to_usermode(regs);
> } else if (regs->flags & X86_EFLAGS_IF) {
> /* Check kernel preemption, if enabled */
> if (IS_ENABLED(CONFIG_PREEMPTION)) {
> 
> }
> instrumentation_begin();
> /* Tell the tracer that IRET will enable interrupts */
> trace_hardirqs_on_prepare();
> lockdep_hardirqs_on_prepare(CALLER_ADDR0);
> instrumentation_end();
> rcu_irq_exit();
> lockdep_hardirqs_on(CALLER_ADDR0);
> } else {
> /* IRQ flags state is correct already. Just tell RCU */
> rcu_irq_exit();
> }
>
> So in case IF is set then this already told the tracer and lockdep that
> interrupts are enabled. And contrary to the ugly version this exit path
> does not use rcu_irq_exit_preempt() which is there to warn about crappy
> RCU state when trying to schedule.
>
> So we went great length to sanitize _all_ of this and make it consistent
> just to say: screw it for that xen thingy.
>
> The extra checks and extra warnings for scheduling come with the
> guarantee to bitrot when idtentry_exit() or any logic invoked from there
> is changed. It's going to look like this:
>
> /*
>  * If the below causes problems due to inconsistent state
>  * or out of sync sanity checks, please complain to
>  * l...@kernel.org directly.
>  */
> idtentry_exit();
>
> if (user_mode(regs) || !(regs->flags & X86_EFLAGS_IF))
> return;
>
> if (!__this_cpu_read(xen_in_preemptible_hcall))
> return;
>
> rcu_sanity_check_for_preemption();
>
> if (need_resched()) {
> instrumentation_begin();
> xen_maybe_preempt_hcall();
> trace_hardirqs_on();
> instrumentation_end();
> }
>
> Of course you need the extra rcu_sanity_check_for_preemption() function
> just for this muck.
>
> That's a true win on all ends? I don't think so.

Hmm, fair enough.  I guess the IRQ tracing messes a bunch of this logic up.

Let's keep your patch as is and consider cleanups later.  One approach
might be to make this work more like extable handling: instead of
trying to schedule from inside the interrupt handler here, patch up
RIP and perhaps some other registers and let the actual Xen code just
do cond_resched().  IOW, try to make this work the way it always
should have:

int ret;
do {
  ret = issue_the_hypercall();
  cond_resched();
} while (ret == EAGAIN);


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Thomas Gleixner
Andy Lutomirski  writes:
> On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner  wrote:
>> Which brings you into the situation that you call schedule() from the
>> point where we just moved it out. If we would go there we'd need to
>> ensure that RCU is watching as well. idtentry_exit() might have it
>> turned off 
>
> I don't think this is possible.  Once you untangle all the wrappers,
> the call sites are effectively:
>
> __this_cpu_write(xen_in_preemptible_hcall, true);
> CALL_NOSPEC to the hypercall page
> __this_cpu_write(xen_in_preemptible_hcall, false);
>
> I think IF=1 when this happens, but I won't swear to it.  RCU had
> better be watching.
>
> As I understand it, the one and only situation Xen wants to handle is
> that an interrupt gets delivered during the hypercall.  The hypervisor
> is too clever for its own good and deals with this by rewinding RIP to
> the beginning of whatever instruction did the hypercall and delivers
> the interrupt, and we end up in this handler.  So, if this happens,
> the idea is to not only handle the interrupt but to schedule if
> scheduling would be useful.
>
> So I don't think we need all this RCU magic.  This really ought to be
> able to be simplified to:
>
> idtentry_exit();
>
> if (appropriate condition)
>   schedule();

This is exactly the kind of tinkering which causes all kinds of trouble.

idtentry_exit()

if (user_mode(regs)) {
prepare_exit_to_usermode(regs);
} else if (regs->flags & X86_EFLAGS_IF) {
/* Check kernel preemption, if enabled */
if (IS_ENABLED(CONFIG_PREEMPTION)) {

}
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
rcu_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
} else {
/* IRQ flags state is correct already. Just tell RCU */
rcu_irq_exit();
}

So in case IF is set then this already told the tracer and lockdep that
interrupts are enabled. And contrary to the ugly version this exit path
does not use rcu_irq_exit_preempt() which is there to warn about crappy
RCU state when trying to schedule.

So we went great length to sanitize _all_ of this and make it consistent
just to say: screw it for that xen thingy.

The extra checks and extra warnings for scheduling come with the
guarantee to bitrot when idtentry_exit() or any logic invoked from there
is changed. It's going to look like this:

/*
 * If the below causes problems due to inconsistent state
 * or out of sync sanity checks, please complain to
 * l...@kernel.org directly.
 */
idtentry_exit();

if (user_mode(regs) || !(regs->flags & X86_EFLAGS_IF))
return;

if (!__this_cpu_read(xen_in_preemptible_hcall))
return;

rcu_sanity_check_for_preemption();

if (need_resched()) {
instrumentation_begin();
xen_maybe_preempt_hcall();
trace_hardirqs_on();
instrumentation_end();
}   

Of course you need the extra rcu_sanity_check_for_preemption() function
just for this muck.

That's a true win on all ends? I don't think so.

Thanks,

tglx


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Andrew Cooper
On 20/05/2020 09:06, Jürgen Groß wrote:
> On 19.05.20 21:44, Andy Lutomirski wrote:
>> On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner 
>> wrote:
>>>
>>> Andy Lutomirski  writes:
 B: Turn this thing around.  Specifically, in the one and only case we
 care about, we know pretty much exactly what context we got this entry
 in: we're running in a schedulable context doing an explicitly
 preemptible hypercall, and we have RIP pointing at a SYSCALL
 instruction (presumably, but we shouldn't bet on it) in the hypercall
 page.  Ideally we would change the Xen PV ABI so the hypercall would
 return something like EAGAIN instead of auto-restarting and we could
 ditch this mess entirely.  But the ABI seems to be set in stone or at
 least in molasses, so how about just:

 idt_entry(exit(regs));
 if (inhcall && need_resched())
    schedule();
>>>
>>> Which brings you into the situation that you call schedule() from the
>>> point where we just moved it out. If we would go there we'd need to
>>> ensure that RCU is watching as well. idtentry_exit() might have it
>>> turned off 
>>
>> I don't think this is possible.  Once you untangle all the wrappers,
>> the call sites are effectively:
>>
>> __this_cpu_write(xen_in_preemptible_hcall, true);
>> CALL_NOSPEC to the hypercall page
>> __this_cpu_write(xen_in_preemptible_hcall, false);
>>
>> I think IF=1 when this happens, but I won't swear to it.  RCU had
>> better be watching.
>
> Preemptible hypercalls are never done with interrupts off. To be more
> precise: they are only ever done during ioctl() processing.
>
> I can add an ASSERT() to xen_preemptible_hcall_begin() if you want.
>
>>
>> As I understand it, the one and only situation Xen wants to handle is
>> that an interrupt gets delivered during the hypercall.  The hypervisor
>> is too clever for its own good and deals with this by rewinding RIP to
>> the beginning of whatever instruction did the hypercall and delivers
>> the interrupt, and we end up in this handler.  So, if this happens,
>> the idea is to not only handle the interrupt but to schedule if
>> scheduling would be useful.
>
> Correct. More precise: the hypercalls in question can last very long
> (up to several seconds) and so they need to be interruptible. As said
> before: the interface how this is done is horrible. :-(

Forget seconds.  DOMCTL_domain_kill gets to ~14 minutes for a 2TB domain.

The reason for the existing logic is to be able voluntarily preempt.

It doesn't need to remain the way it is, but some adequate form of
pre-emption does need to stay.

~Andrew


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-20 Thread Jürgen Groß

On 19.05.20 21:44, Andy Lutomirski wrote:

On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner  wrote:


Andy Lutomirski  writes:

On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner  wrote:

@@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs 
*regs)
 instrumentation_end();
 return;
 }
+   } else if (IS_ENABLED(CONFIG_XEN_PV)) {
+   if (preempt_hcall) {
+   /* See CONFIG_PREEMPTION above */
+   instrumentation_begin();
+   rcu_irq_exit_preempt();
+   xen_maybe_preempt_hcall();
+   trace_hardirqs_on();
+   instrumentation_end();
+   return;
+   }


Ew!  This shouldn't be taken as a NAK -- it's just an expression
of disgust.


I'm really not proud of it, but that was the least horrible thing I
could come up with.


Shouldn't this be:

instrumentation_begin();
if (!irq_needs_irq_stack(...))
   __blah();
else
   run_on_irqstack(__blah, NULL);
instrumentation_end();

or even:

instrumentation_begin();
run_on_irqstack_if_needed(__blah, NULL);
instrumentation_end();


Yeah. In that case the instrumentation markers are not required as they
will be inside the run() function.


** BUT ***

I think this is all arse-backwards.  This is a giant mess designed to
pretend we support preemption and to emulate normal preemption in a
non-preemptible kernel.  I propose one to two massive cleanups:

A: Just delete all of this code.  Preemptible hypercalls on
non-preempt kernels will still process interrupts but won't get
preempted.  If you want preemption, compile with preemption.


I'm happy to do so, but the XEN folks might have opinions on that :)


Indeed. :-)




B: Turn this thing around.  Specifically, in the one and only case we
care about, we know pretty much exactly what context we got this entry
in: we're running in a schedulable context doing an explicitly
preemptible hypercall, and we have RIP pointing at a SYSCALL
instruction (presumably, but we shouldn't bet on it) in the hypercall
page.  Ideally we would change the Xen PV ABI so the hypercall would
return something like EAGAIN instead of auto-restarting and we could
ditch this mess entirely.  But the ABI seems to be set in stone or at
least in molasses, so how about just:

idt_entry(exit(regs));
if (inhcall && need_resched())
   schedule();


Which brings you into the situation that you call schedule() from the
point where we just moved it out. If we would go there we'd need to
ensure that RCU is watching as well. idtentry_exit() might have it
turned off 


I don't think this is possible.  Once you untangle all the wrappers,
the call sites are effectively:

__this_cpu_write(xen_in_preemptible_hcall, true);
CALL_NOSPEC to the hypercall page
__this_cpu_write(xen_in_preemptible_hcall, false);

I think IF=1 when this happens, but I won't swear to it.  RCU had
better be watching.


Preemptible hypercalls are never done with interrupts off. To be more
precise: they are only ever done during ioctl() processing.

I can add an ASSERT() to xen_preemptible_hcall_begin() if you want.



As I understand it, the one and only situation Xen wants to handle is
that an interrupt gets delivered during the hypercall.  The hypervisor
is too clever for its own good and deals with this by rewinding RIP to
the beginning of whatever instruction did the hypercall and delivers
the interrupt, and we end up in this handler.  So, if this happens,
the idea is to not only handle the interrupt but to schedule if
scheduling would be useful.


Correct. More precise: the hypercalls in question can last very long
(up to several seconds) and so they need to be interruptible. As said
before: the interface how this is done is horrible. :-(



So I don't think we need all this RCU magic.  This really ought to be
able to be simplified to:

idtentry_exit();

if (appropriate condition)
   schedule();

Obviously we don't want to schedule if this is a nested entry, but we
should be able to rule that out by checking that regs->flags &
X86_EFLAGS_IF and by handling the percpu variable a little more
intelligently.  So maybe the right approach is:

bool in_preemptible_hcall = __this_cpu_read(xen_in_preemptible_hcall);
__this_cpu_write(xen_in_preemptible_hcall, false);
idtentry_enter(...);

do the actual work;

idtentry_exit(...);

if (in_preemptible_hcall) {
   assert regs->flags & X86_EFLAGS_IF;
   assert that RCU is watching;
   assert that we're on the thread stack;
   assert whatever else we feel like asserting;
   if (need_resched())
 schedule();
}

__this_cpu_write(xen_in_preemptible_hcall, in_preemptible_hcall);

And now we don't have a special idtentry_exit() case just for Xen, and
all the mess is entirely contained in the 

Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-19 Thread Andy Lutomirski
On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner  wrote:
>
> Andy Lutomirski  writes:
> > On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner  wrote:
> >> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct 
> >> pt_regs *regs)
> >> instrumentation_end();
> >> return;
> >> }
> >> +   } else if (IS_ENABLED(CONFIG_XEN_PV)) {
> >> +   if (preempt_hcall) {
> >> +   /* See CONFIG_PREEMPTION above */
> >> +   instrumentation_begin();
> >> +   rcu_irq_exit_preempt();
> >> +   xen_maybe_preempt_hcall();
> >> +   trace_hardirqs_on();
> >> +   instrumentation_end();
> >> +   return;
> >> +   }
> >
> > Ew!  This shouldn't be taken as a NAK -- it's just an expression
> > of disgust.
>
> I'm really not proud of it, but that was the least horrible thing I
> could come up with.
>
> > Shouldn't this be:
> >
> > instrumentation_begin();
> > if (!irq_needs_irq_stack(...))
> >   __blah();
> > else
> >   run_on_irqstack(__blah, NULL);
> > instrumentation_end();
> >
> > or even:
> >
> > instrumentation_begin();
> > run_on_irqstack_if_needed(__blah, NULL);
> > instrumentation_end();
>
> Yeah. In that case the instrumentation markers are not required as they
> will be inside the run() function.
>
> > ** BUT ***
> >
> > I think this is all arse-backwards.  This is a giant mess designed to
> > pretend we support preemption and to emulate normal preemption in a
> > non-preemptible kernel.  I propose one to two massive cleanups:
> >
> > A: Just delete all of this code.  Preemptible hypercalls on
> > non-preempt kernels will still process interrupts but won't get
> > preempted.  If you want preemption, compile with preemption.
>
> I'm happy to do so, but the XEN folks might have opinions on that :)
>
> > B: Turn this thing around.  Specifically, in the one and only case we
> > care about, we know pretty much exactly what context we got this entry
> > in: we're running in a schedulable context doing an explicitly
> > preemptible hypercall, and we have RIP pointing at a SYSCALL
> > instruction (presumably, but we shouldn't bet on it) in the hypercall
> > page.  Ideally we would change the Xen PV ABI so the hypercall would
> > return something like EAGAIN instead of auto-restarting and we could
> > ditch this mess entirely.  But the ABI seems to be set in stone or at
> > least in molasses, so how about just:
> >
> > idt_entry(exit(regs));
> > if (inhcall && need_resched())
> >   schedule();
>
> Which brings you into the situation that you call schedule() from the
> point where we just moved it out. If we would go there we'd need to
> ensure that RCU is watching as well. idtentry_exit() might have it
> turned off 

I don't think this is possible.  Once you untangle all the wrappers,
the call sites are effectively:

__this_cpu_write(xen_in_preemptible_hcall, true);
CALL_NOSPEC to the hypercall page
__this_cpu_write(xen_in_preemptible_hcall, false);

I think IF=1 when this happens, but I won't swear to it.  RCU had
better be watching.

As I understand it, the one and only situation Xen wants to handle is
that an interrupt gets delivered during the hypercall.  The hypervisor
is too clever for its own good and deals with this by rewinding RIP to
the beginning of whatever instruction did the hypercall and delivers
the interrupt, and we end up in this handler.  So, if this happens,
the idea is to not only handle the interrupt but to schedule if
scheduling would be useful.

So I don't think we need all this RCU magic.  This really ought to be
able to be simplified to:

idtentry_exit();

if (appropriate condition)
  schedule();

Obviously we don't want to schedule if this is a nested entry, but we
should be able to rule that out by checking that regs->flags &
X86_EFLAGS_IF and by handling the percpu variable a little more
intelligently.  So maybe the right approach is:

bool in_preemptible_hcall = __this_cpu_read(xen_in_preemptible_hcall);
__this_cpu_write(xen_in_preemptible_hcall, false);
idtentry_enter(...);

do the actual work;

idtentry_exit(...);

if (in_preemptible_hcall) {
  assert regs->flags & X86_EFLAGS_IF;
  assert that RCU is watching;
  assert that we're on the thread stack;
  assert whatever else we feel like asserting;
  if (need_resched())
schedule();
}

__this_cpu_write(xen_in_preemptible_hcall, in_preemptible_hcall);

And now we don't have a special idtentry_exit() case just for Xen, and
all the mess is entirely contained in the Xen PV code.  And we need to
mark all the preemptible hypercalls noinstr.  Does this seem
reasonable?

That being said, right now, with or without your patch, I think we're
toast if the preemptible hypercall code gets traced.  So 

Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-19 Thread Thomas Gleixner
Andy Lutomirski  writes:
> On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner  wrote:
>> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct 
>> pt_regs *regs)
>> instrumentation_end();
>> return;
>> }
>> +   } else if (IS_ENABLED(CONFIG_XEN_PV)) {
>> +   if (preempt_hcall) {
>> +   /* See CONFIG_PREEMPTION above */
>> +   instrumentation_begin();
>> +   rcu_irq_exit_preempt();
>> +   xen_maybe_preempt_hcall();
>> +   trace_hardirqs_on();
>> +   instrumentation_end();
>> +   return;
>> +   }
>
> Ew!  This shouldn't be taken as a NAK -- it's just an expression
> of disgust.

I'm really not proud of it, but that was the least horrible thing I
could come up with.

> Shouldn't this be:
>
> instrumentation_begin();
> if (!irq_needs_irq_stack(...))
>   __blah();
> else
>   run_on_irqstack(__blah, NULL);
> instrumentation_end();
>
> or even:
>
> instrumentation_begin();
> run_on_irqstack_if_needed(__blah, NULL);
> instrumentation_end();

Yeah. In that case the instrumentation markers are not required as they
will be inside the run() function.

> ** BUT ***
>
> I think this is all arse-backwards.  This is a giant mess designed to
> pretend we support preemption and to emulate normal preemption in a
> non-preemptible kernel.  I propose one to two massive cleanups:
>
> A: Just delete all of this code.  Preemptible hypercalls on
> non-preempt kernels will still process interrupts but won't get
> preempted.  If you want preemption, compile with preemption.

I'm happy to do so, but the XEN folks might have opinions on that :)

> B: Turn this thing around.  Specifically, in the one and only case we
> care about, we know pretty much exactly what context we got this entry
> in: we're running in a schedulable context doing an explicitly
> preemptible hypercall, and we have RIP pointing at a SYSCALL
> instruction (presumably, but we shouldn't bet on it) in the hypercall
> page.  Ideally we would change the Xen PV ABI so the hypercall would
> return something like EAGAIN instead of auto-restarting and we could
> ditch this mess entirely.  But the ABI seems to be set in stone or at
> least in molasses, so how about just:
>
> idt_entry(exit(regs));
> if (inhcall && need_resched())
>   schedule();

Which brings you into the situation that you call schedule() from the
point where we just moved it out. If we would go there we'd need to
ensure that RCU is watching as well. idtentry_exit() might have it
turned off 

That's why I did it this way to keep the code flow exactly the same for
all these exit variants.

Thanks,

tglx


Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-19 Thread Andy Lutomirski
On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner  wrote:
>
>
> Convert the XEN/PV hypercall to IDTENTRY:
>
>   - Emit the ASM stub with DECLARE_IDTENTRY
>   - Remove the ASM idtentry in 64bit
>   - Remove the open coded ASM entry code in 32bit
>   - Remove the old prototypes
>
> The handler stubs need to stay in ASM code as it needs corner case handling
> and adjustment of the stack pointer.
>
> Provide a new C function which invokes the entry/exit handling and calls
> into the XEN handler on the interrupt stack.
>
> The exit code is slightly different from the regular idtentry_exit() on
> non-preemptible kernels. If the hypercall is preemptible and need_resched()
> is set then XEN provides a preempt hypercall scheduling function. Add it as
> conditional path to __idtentry_exit() so the function can be reused.
>
> __idtentry_exit() is forced inlined so on the regular idtentry_exit() path
> the extra condition is optimized out by the compiler.
>
> Signed-off-by: Thomas Gleixner 
> Cc: Boris Ostrovsky 
> Cc: Juergen Gross 
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 882ada245bd5..34caf3849632 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -27,6 +27,9 @@
>  #include 
>  #include 
>
> +#include 
> +#include 
> +
>  #include 
>  #include 
>  #include 
> @@ -35,6 +38,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #define CREATE_TRACE_POINTS
>  #include 
> @@ -539,7 +543,8 @@ void noinstr idtentry_enter(struct pt_regs *regs)
> }
>  }
>
> -static __always_inline void __idtentry_exit(struct pt_regs *regs)
> +static __always_inline void __idtentry_exit(struct pt_regs *regs,
> +   bool preempt_hcall)
>  {
> lockdep_assert_irqs_disabled();
>
> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct 
> pt_regs *regs)
> instrumentation_end();
> return;
> }
> +   } else if (IS_ENABLED(CONFIG_XEN_PV)) {
> +   if (preempt_hcall) {
> +   /* See CONFIG_PREEMPTION above */
> +   instrumentation_begin();
> +   rcu_irq_exit_preempt();
> +   xen_maybe_preempt_hcall();
> +   trace_hardirqs_on();
> +   instrumentation_end();
> +   return;
> +   }

Ew!  This shouldn't be taken as a NAK -- it's just an expression of disgust.

> }
> /*
>  * If preemption is disabled then this needs to be done
> @@ -612,5 +627,43 @@ static __always_inline void __idtentry_exit(struct 
> pt_regs *regs)
>   */
>  void noinstr idtentry_exit(struct pt_regs *regs)
>  {
> -   __idtentry_exit(regs);
> +   __idtentry_exit(regs, false);
> +}
> +
> +#ifdef CONFIG_XEN_PV
> +static void __xen_pv_evtchn_do_upcall(void)
> +{
> +   irq_enter_rcu();
> +   inc_irq_stat(irq_hv_callback_count);
> +
> +   xen_hvm_evtchn_do_upcall();
> +
> +   irq_exit_rcu();
> +}
> +
> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> +   struct pt_regs *old_regs;
> +
> +   idtentry_enter(regs);
> +   old_regs = set_irq_regs(regs);
> +
> +   if (!irq_needs_irq_stack(regs)) {
> +   instrumentation_begin();
> +   __xen_pv_evtchn_do_upcall();
> +   instrumentation_end();
> +   } else {
> +   run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL);
> +   }

Shouldn't this be:

instrumentation_begin();
if (!irq_needs_irq_stack(...))
  __blah();
else
  run_on_irqstack(__blah, NULL);
instrumentation_end();

or even:

instrumentation_begin();
run_on_irqstack_if_needed(__blah, NULL);
instrumentation_end();


** BUT ***

I think this is all arse-backwards.  This is a giant mess designed to
pretend we support preemption and to emulate normal preemption in a
non-preemptible kernel.  I propose one to two massive cleanups:

A: Just delete all of this code.  Preemptible hypercalls on
non-preempt kernels will still process interrupts but won't get
preempted.  If you want preemption, compile with preemption.

B: Turn this thing around.  Specifically, in the one and only case we
care about, we know pretty much exactly what context we got this entry
in: we're running in a schedulable context doing an explicitly
preemptible hypercall, and we have RIP pointing at a SYSCALL
instruction (presumably, but we shouldn't bet on it) in the hypercall
page.  Ideally we would change the Xen PV ABI so the hypercall would
return something like EAGAIN instead of auto-restarting and we could
ditch this mess entirely.  But the ABI seems to be set in stone or at
least in molasses, so how about just:

idt_entry(exit(regs));
if (inhcall && need_resched())
  schedule();

Off the top 

[patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

2020-05-15 Thread Thomas Gleixner


Convert the XEN/PV hypercall to IDTENTRY:

  - Emit the ASM stub with DECLARE_IDTENTRY
  - Remove the ASM idtentry in 64bit
  - Remove the open coded ASM entry code in 32bit
  - Remove the old prototypes

The handler stubs need to stay in ASM code as it needs corner case handling
and adjustment of the stack pointer.

Provide a new C function which invokes the entry/exit handling and calls
into the XEN handler on the interrupt stack.

The exit code is slightly different from the regular idtentry_exit() on
non-preemptible kernels. If the hypercall is preemptible and need_resched()
is set then XEN provides a preempt hypercall scheduling function. Add it as
conditional path to __idtentry_exit() so the function can be reused.

__idtentry_exit() is forced inlined so on the regular idtentry_exit() path
the extra condition is optimized out by the compiler.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 882ada245bd5..34caf3849632 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 #include 
 #include 
 #include 
@@ -35,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -539,7 +543,8 @@ void noinstr idtentry_enter(struct pt_regs *regs)
}
 }
 
-static __always_inline void __idtentry_exit(struct pt_regs *regs)
+static __always_inline void __idtentry_exit(struct pt_regs *regs,
+   bool preempt_hcall)
 {
lockdep_assert_irqs_disabled();
 
@@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs 
*regs)
instrumentation_end();
return;
}
+   } else if (IS_ENABLED(CONFIG_XEN_PV)) {
+   if (preempt_hcall) {
+   /* See CONFIG_PREEMPTION above */
+   instrumentation_begin();
+   rcu_irq_exit_preempt();
+   xen_maybe_preempt_hcall();
+   trace_hardirqs_on();
+   instrumentation_end();
+   return;
+   }
}
/*
 * If preemption is disabled then this needs to be done
@@ -612,5 +627,43 @@ static __always_inline void __idtentry_exit(struct pt_regs 
*regs)
  */
 void noinstr idtentry_exit(struct pt_regs *regs)
 {
-   __idtentry_exit(regs);
+   __idtentry_exit(regs, false);
+}
+
+#ifdef CONFIG_XEN_PV
+static void __xen_pv_evtchn_do_upcall(void)
+{
+   irq_enter_rcu();
+   inc_irq_stat(irq_hv_callback_count);
+
+   xen_hvm_evtchn_do_upcall();
+
+   irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+   struct pt_regs *old_regs;
+
+   idtentry_enter(regs);
+   old_regs = set_irq_regs(regs);
+
+   if (!irq_needs_irq_stack(regs)) {
+   instrumentation_begin();
+   __xen_pv_evtchn_do_upcall();
+   instrumentation_end();
+   } else {
+   run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL);
+   }
+
+   set_irq_regs(old_regs);
+
+   if (IS_ENABLED(CONFIG_PREEMPTION)) {
+   __idtentry_exit(regs, false);
+   } else {
+   bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+   __idtentry_exit(regs, inhcall && need_resched());
+   }
 }
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7563a87d7539..6ac890d5c9d8 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1298,7 +1298,10 @@ SYM_CODE_END(native_iret)
 #endif
 
 #ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
+/*
+ * See comment in entry_64.S for further explanation
+ */
+SYM_FUNC_START(exc_xen_hypervisor_callback)
/*
 * Check to see if we got the event in the critical
 * region in xen_iret_direct, after we've reenabled
@@ -1315,14 +1318,11 @@ SYM_FUNC_START(xen_hypervisor_callback)
pushl   $-1 /* orig_ax = -1 => not a system 
call */
SAVE_ALL
ENCODE_FRAME_POINTER
-   TRACE_IRQS_OFF
+
mov %esp, %eax
-   callxen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
-   callxen_maybe_preempt_hcall
-#endif
-   jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
+   callxen_pv_evtchn_do_upcall
+   jmp handle_exception_return
+SYM_FUNC_END(exc_xen_hypervisor_callback)
 
 /*
  * Hypervisor uses this for application faults while it executes.
@@ -1464,6 +1464,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
movl%esp, %eax  # pt_regs pointer
CALL_NOSPEC edi