Re: [please test] pvclock(4): fix several bugs

2022-09-09 Thread Scott Cheloha
On Thu, Sep 08, 2022 at 10:17:12AM -0500, Scott Cheloha wrote:
> > On Sep 8, 2022, at 9:05 AM, Mike Larkin  wrote:
> > [...]
> > 
> > You could compile this and then objdump -D it and see for yourself...
> 
> I can't make heads or tails of it.  Please explain what I am looking
> at and why it is, or is not, atomic.

Or I guess I can just wing it.

Okay, so this C code:

volatile uint64_t pvclock_lastcount;

/* [...] */

void
pvclock_get_timecount(struct timecounter *tc)
{
uint64_t ctr, last;

/* [...] */

do {
last = pvclock_lastcount;
if (ctr < last)
return (last);
} while (atomic_cas_64(_lastcount, last, ctr) != last);

return (ctr);
}

... yields this amd64 disassembly (from the linked bsd binary):

81de242b:   75 2b   jne81de2458 

81de242d:   eb 01   jmp81de2430 

81de242f:   cc  int3   
81de2430:   48 8b 0d 91 fc 6c 00mov7142545(%rip),%rcx   
 # 824b20c8 
81de2437:   48 87 d1xchg   %rdx,%rcx
81de243a:   48 39 d1cmp%rdx,%rcx
81de243d:   48 87 d1xchg   %rdx,%rcx
81de2440:   72 13   jb 81de2455 

81de2442:   48 89 c8mov%rcx,%rax
81de2445:   f0 48 0f b1 15 7a fclock cmpxchg %rdx,7142522(%rip) 
   # 824b20c8 
81de244c:   6c 00 
81de244e:   48 39 c8cmp%rcx,%rax
81de2451:   75 dd   jne81de2430 

81de2453:   eb 03   jmp81de2458 

81de2455:   48 8b d1mov%rcx,%rdx
81de2458:   89 d0   mov%edx,%eax
81de245a:   48 83 c4 08 add$0x8,%rsp
81de245e:   41 5e   pop%r14
81de2460:   c9  leaveq 
81de2461:   c3  retq

... and also yields this i386 disassembly (from the pvclock.o object):

 2c7:   75 2e   jne2f7 
 2c9:   eb 05   jmp2d0 
 2cb:   cc  int3   
 2cc:   cc  int3   
 2cd:   cc  int3   
 2ce:   cc  int3   
 2cf:   cc  int3   
 2d0:   8b 15 04 00 00 00   mov0x4,%edx
 2d6:   a1 00 00 00 00  mov0x0,%eax
 2db:   87 d8   xchg   %ebx,%eax
 2dd:   39 d8   cmp%ebx,%eax
 2df:   87 d8   xchg   %ebx,%eax
 2e1:   89 f1   mov%esi,%ecx
 2e3:   19 d1   sbb%edx,%ecx
 2e5:   72 0e   jb 2f5 
 2e7:   89 f1   mov%esi,%ecx
 2e9:   f0 0f c7 0d 00 00 00lock cmpxchg8b 0x0
 2f0:   00 
 2f1:   75 dd   jne2d0 
 2f3:   eb 02   jmp2f7 
 2f5:   8b d8   mov%eax,%ebx
 2f7:   89 d8   mov%ebx,%eax
 2f9:   83 c4 1cadd$0x1c,%esp
 2fc:   5e  pop%esi
 2fd:   5f  pop%edi
 2fe:   5b  pop%ebx
 2ff:   5d  pop%ebp
 300:   c3  ret

If we isolate the pvclock_lastcount loads, on amd64 we have:

81de2430:   48 8b 0d 91 fc 6c 00mov7142545(%rip),%rcx   
 # 824b20c8 

and on i386 we have:

 2d0:   8b 15 04 00 00 00   mov0x4,%edx
 2d6:   a1 00 00 00 00  mov0x0,%eax

so the 8-byte load is atomic on amd64 (one load) and non-atomic on
i386 (two loads).

I don't know what jsg@ meant when he said the ifdefs "seemed
unnecessary", but near as I can tell they are necessary.  I need an
atomic 8-byte load and i386 can't, or won't, do it.

So I guess we go back to my original patch.

This resolves kettenis@'s atomic_cas_64() objections because we no
longer need it.

So, once again, the patch in brief:

- Add rdtsc_lfence() to i386/include/cpufunc.h

- Make pvclock_lastcount volatile uint64_t to fix the
  non-PVCLOCK_FLAG_TSC_STABLE case (see sub.).

- Check for SSE2 support in pvclock_match(), we need it for LFENCE
  in pvclock_get_timecount().

- Do RDTSC as soon as possible in the lockless read loop to get
  a better timestamp.

- Use rdtsc_lfence() instead of rdtsc() to get a better timestamp.

- Check whether our TSC lags ti->ti_tsc_timestamp so we don't
  produce a bogus delta.

- Fix the non-PVCLOCK_FLAG_TSC_STABLE case:

  + On amd64 we can do this with an atomic_cas_ulong(9) loop.  We need
to cast the pointer to (unsigned long *) or the compiler complains.
This is safe because sizeof(long) equals sizeof(uint64_t) on amd64.

  + On i386 we need a mutex, so add pvclock_mtx.  Set 

Re: [please test] pvclock(4): fix several bugs

2022-09-08 Thread Scott Cheloha
> On Sep 8, 2022, at 9:05 AM, Mike Larkin  wrote:
> 
> On Thu, Sep 08, 2022 at 08:32:27AM -0500, Scott Cheloha wrote:
>> On Tue, Sep 06, 2022 at 03:30:44AM -0700, Mike Larkin wrote:
>>> On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
 On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
>> On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
 On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
 
 ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> dv@ suggested coming to the list to request testing for the pvclock(4)
> driver.  Attached is a patch that corrects several bugs.  Most of
> these changes will only matter in the non-TSC_STABLE case on a
> multiprocessor VM.
> 
> Ideally, nothing should break.
> 
> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
> use the lower 32 bits, but internally we need to track the full
> 64-bit value to allow comparisons with the full value in the
> non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> 
> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
> loop to get a more accurate timestamp.
> 
> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> 
> - In pvclock_get_timecount(), check that our TSC value doesn't predate
> ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> 
> - In pvclock_get_timecount(), update pvclock_lastcount in the
> non-TSC_STABLE case with more care.  On amd64 we can do this with an
> atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
> to introduce a mutex to protect our comparison and read/write.
 
 i386 has cmpxchg8b, no need to disable interrupts
 the ifdefs seem excessive
>>> 
>>> How do I make use of CMPXCHG8B on i386
>>> in this context?
>>> 
>>> atomic_cas_ulong(9) is a 32-bit CAS on
>>> i386.
>> 
>> static inline uint64_t
>> atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
>> {
>>  return __sync_val_compare_and_swap(p, o, n);
>> }
>> 
>> Or md atomic.h files could have an equivalent.
>> Not possible on all 32-bit archs.
>> 
>>> 
>>> We can't use FP registers in the kernel, no?
>> 
>> What do FP registers have to do with it?
>> 
>>> 
>>> Am I missing some other avenue?
>> 
>> There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> 
> LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> kind of fault on an older CPU.
> 
> What are my options on a 586-class CPU for forcing RDTSC to complete
> before later instructions?
 
 "3.3.2. Serializing Operations
 After executing certain instructions the Pentium processor serializes
 instruction execution. This means that any modifications to flags,
 registers, and memory for previous instructions are completed before
 the next instruction is fetched and executed. The prefetch queue
 is flushed as a result of serializing operations.
 
 The Pentium processor serializes instruction execution after executing
 one of the following instructions: Move to Special Register (except
 CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
 CPUID, RSM and WRMSR."
 
 from:
 Pentium Processor User's Manual
 Volume 1: Pentium Processor Data Book
 Order Number 241428
 
 http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
 
 So it could be rdtsc ; cpuid.
 lfence; rdtsc should still be preferred.
 
 It could be tested during boot and set a function pointer.
 Or the codepatch bits could be used.
 
 In the specific case of pvclock, can it be assumed that the host
 has hardware virt and would then have lfence?
 
>>> 
>>> I think this is a fair assumption.
>> 
>> Okay, so how about we use rdtsc_lfence() in pvclock_get_timecount()
>> and, just to be safe, check for SSE2 during pvclock_match().
>> 
>> If in the future someone wants pvclock(4) support on guests without
>> SSE2 support (if any exist) we can deal with it at that time.
>> 
>> Updated patch:
>> 
>> - Add atomic_cas_64() to amd64/include/atomic.h.
>> 
>>  (NetBSD calls it atomic_cas_64(), so this is consistent with their
>>  atomic function naming scheme.)
>> 
>>  I am unsure whether it would be better to just do this:
>> 
>> #define atomic_cas_64(_p, _e, _n) _atomic_cas_ulong((unsigned long *)(_p), 
>> (_e), (_n))
>> 
>>  If I don't cast the pointer, the compiler complains, so I added
>>  

Re: [please test] pvclock(4): fix several bugs

2022-09-08 Thread Scott Cheloha
> On Sep 8, 2022, at 9:27 AM, Mark Kettenis  wrote:
> 
>> Date: Thu, 8 Sep 2022 08:32:27 -0500
>> From: Scott Cheloha 
>> 
>> On Tue, Sep 06, 2022 at 03:30:44AM -0700, Mike Larkin wrote:
>>> On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
 On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
>> On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
 On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
 
 ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> dv@ suggested coming to the list to request testing for the pvclock(4)
> driver.  Attached is a patch that corrects several bugs.  Most of
> these changes will only matter in the non-TSC_STABLE case on a
> multiprocessor VM.
> 
> Ideally, nothing should break.
> 
> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
> use the lower 32 bits, but internally we need to track the full
> 64-bit value to allow comparisons with the full value in the
> non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> 
> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
> loop to get a more accurate timestamp.
> 
> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> 
> - In pvclock_get_timecount(), check that our TSC value doesn't predate
> ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> 
> - In pvclock_get_timecount(), update pvclock_lastcount in the
> non-TSC_STABLE case with more care.  On amd64 we can do this with an
> atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
> to introduce a mutex to protect our comparison and read/write.
 
 i386 has cmpxchg8b, no need to disable interrupts
 the ifdefs seem excessive
>>> 
>>> How do I make use of CMPXCHG8B on i386
>>> in this context?
>>> 
>>> atomic_cas_ulong(9) is a 32-bit CAS on
>>> i386.
>> 
>> static inline uint64_t
>> atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
>> {
>>  return __sync_val_compare_and_swap(p, o, n);
>> }
>> 
>> Or md atomic.h files could have an equivalent.
>> Not possible on all 32-bit archs.
>> 
>>> 
>>> We can't use FP registers in the kernel, no?
>> 
>> What do FP registers have to do with it?
>> 
>>> 
>>> Am I missing some other avenue?
>> 
>> There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> 
> LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> kind of fault on an older CPU.
> 
> What are my options on a 586-class CPU for forcing RDTSC to complete
> before later instructions?
 
 "3.3.2. Serializing Operations
 After executing certain instructions the Pentium processor serializes
 instruction execution. This means that any modifications to flags,
 registers, and memory for previous instructions are completed before
 the next instruction is fetched and executed. The prefetch queue
 is flushed as a result of serializing operations.
 
 The Pentium processor serializes instruction execution after executing
 one of the following instructions: Move to Special Register (except
 CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
 CPUID, RSM and WRMSR."
 
 from:
 Pentium Processor User's Manual
 Volume 1: Pentium Processor Data Book
 Order Number 241428
 
 http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
 
 So it could be rdtsc ; cpuid.
 lfence; rdtsc should still be preferred.
 
 It could be tested during boot and set a function pointer.
 Or the codepatch bits could be used.
 
 In the specific case of pvclock, can it be assumed that the host
 has hardware virt and would then have lfence?
 
>>> 
>>> I think this is a fair assumption.
>> 
>> Okay, so how about we use rdtsc_lfence() in pvclock_get_timecount()
>> and, just to be safe, check for SSE2 during pvclock_match().
>> 
>> If in the future someone wants pvclock(4) support on guests without
>> SSE2 support (if any exist) we can deal with it at that time.
>> 
>> Updated patch:
>> 
>> - Add atomic_cas_64() to amd64/include/atomic.h.
>> 
>>  (NetBSD calls it atomic_cas_64(), so this is consistent with their
>>  atomic function naming scheme.)
>> 
>>  I am unsure whether it would be better to just do this:
>> 
>> #define atomic_cas_64(_p, _e, _n) _atomic_cas_ulong((unsigned long *)(_p), 
>> (_e), (_n))
>> 
>>  If I don't cast the pointer, the compiler complains, so I added
>>  

Re: [please test] pvclock(4): fix several bugs

2022-09-08 Thread Mark Kettenis
> Date: Thu, 8 Sep 2022 08:32:27 -0500
> From: Scott Cheloha 
> 
> On Tue, Sep 06, 2022 at 03:30:44AM -0700, Mike Larkin wrote:
> > On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
> > > On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> > > > On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> > > > > On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > > > > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > > > > >
> > > > > > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > > > > > >> dv@ suggested coming to the list to request testing for the 
> > > > > > >> pvclock(4)
> > > > > > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > > > > > >> these changes will only matter in the non-TSC_STABLE case on a
> > > > > > >> multiprocessor VM.
> > > > > > >>
> > > > > > >> Ideally, nothing should break.
> > > > > > >>
> > > > > > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can 
> > > > > > >> only
> > > > > > >>  use the lower 32 bits, but internally we need to track the full
> > > > > > >>  64-bit value to allow comparisons with the full value in the
> > > > > > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit 
> > > > > > >> quantity.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless 
> > > > > > >> read
> > > > > > >>  loop to get a more accurate timestamp.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), check that our TSC value doesn't 
> > > > > > >> predate
> > > > > > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous 
> > > > > > >> value.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > > > > > >>  non-TSC_STABLE case with more care.  On amd64 we can do this 
> > > > > > >> with an
> > > > > > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we 
> > > > > > >> need
> > > > > > >>  to introduce a mutex to protect our comparison and read/write.
> > > > > > >
> > > > > > > i386 has cmpxchg8b, no need to disable interrupts
> > > > > > > the ifdefs seem excessive
> > > > > >
> > > > > > How do I make use of CMPXCHG8B on i386
> > > > > > in this context?
> > > > > >
> > > > > > atomic_cas_ulong(9) is a 32-bit CAS on
> > > > > > i386.
> > > > >
> > > > > static inline uint64_t
> > > > > atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> > > > > {
> > > > >   return __sync_val_compare_and_swap(p, o, n);
> > > > > }
> > > > >
> > > > > Or md atomic.h files could have an equivalent.
> > > > > Not possible on all 32-bit archs.
> > > > >
> > > > > >
> > > > > > We can't use FP registers in the kernel, no?
> > > > >
> > > > > What do FP registers have to do with it?
> > > > >
> > > > > >
> > > > > > Am I missing some other avenue?
> > > > >
> > > > > There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> > > >
> > > > LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> > > > drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> > > > kind of fault on an older CPU.
> > > >
> > > > What are my options on a 586-class CPU for forcing RDTSC to complete
> > > > before later instructions?
> > >
> > > "3.3.2. Serializing Operations
> > > After executing certain instructions the Pentium processor serializes
> > > instruction execution. This means that any modifications to flags,
> > > registers, and memory for previous instructions are completed before
> > > the next instruction is fetched and executed. The prefetch queue
> > > is flushed as a result of serializing operations.
> > >
> > > The Pentium processor serializes instruction execution after executing
> > > one of the following instructions: Move to Special Register (except
> > > CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
> > > CPUID, RSM and WRMSR."
> > >
> > > from:
> > > Pentium Processor User's Manual
> > > Volume 1: Pentium Processor Data Book
> > > Order Number 241428
> > >
> > > http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
> > >
> > > So it could be rdtsc ; cpuid.
> > > lfence; rdtsc should still be preferred.
> > >
> > > It could be tested during boot and set a function pointer.
> > > Or the codepatch bits could be used.
> > >
> > > In the specific case of pvclock, can it be assumed that the host
> > > has hardware virt and would then have lfence?
> > >
> > 
> > I think this is a fair assumption.
> 
> Okay, so how about we use rdtsc_lfence() in pvclock_get_timecount()
> and, just to be safe, check for SSE2 during pvclock_match().
> 
> If in the future someone wants pvclock(4) support on guests without
> SSE2 support (if any exist) we can deal with it at that time.
> 
> Updated patch:
> 
> - Add atomic_cas_64() to amd64/include/atomic.h.
> 
>   (NetBSD calls it 

Re: [please test] pvclock(4): fix several bugs

2022-09-08 Thread Jonathan Gray
On Thu, Sep 08, 2022 at 08:32:27AM -0500, Scott Cheloha wrote:
> On Tue, Sep 06, 2022 at 03:30:44AM -0700, Mike Larkin wrote:
> > On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
> > > On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> > > > On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> > > > > On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > > > > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > > > > >
> > > > > > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > > > > > >> dv@ suggested coming to the list to request testing for the 
> > > > > > >> pvclock(4)
> > > > > > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > > > > > >> these changes will only matter in the non-TSC_STABLE case on a
> > > > > > >> multiprocessor VM.
> > > > > > >>
> > > > > > >> Ideally, nothing should break.
> > > > > > >>
> > > > > > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can 
> > > > > > >> only
> > > > > > >>  use the lower 32 bits, but internally we need to track the full
> > > > > > >>  64-bit value to allow comparisons with the full value in the
> > > > > > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit 
> > > > > > >> quantity.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless 
> > > > > > >> read
> > > > > > >>  loop to get a more accurate timestamp.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), check that our TSC value doesn't 
> > > > > > >> predate
> > > > > > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous 
> > > > > > >> value.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > > > > > >>  non-TSC_STABLE case with more care.  On amd64 we can do this 
> > > > > > >> with an
> > > > > > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we 
> > > > > > >> need
> > > > > > >>  to introduce a mutex to protect our comparison and read/write.
> > > > > > >
> > > > > > > i386 has cmpxchg8b, no need to disable interrupts
> > > > > > > the ifdefs seem excessive
> > > > > >
> > > > > > How do I make use of CMPXCHG8B on i386
> > > > > > in this context?
> > > > > >
> > > > > > atomic_cas_ulong(9) is a 32-bit CAS on
> > > > > > i386.
> > > > >
> > > > > static inline uint64_t
> > > > > atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> > > > > {
> > > > >   return __sync_val_compare_and_swap(p, o, n);
> > > > > }
> > > > >
> > > > > Or md atomic.h files could have an equivalent.
> > > > > Not possible on all 32-bit archs.
> > > > >
> > > > > >
> > > > > > We can't use FP registers in the kernel, no?
> > > > >
> > > > > What do FP registers have to do with it?
> > > > >
> > > > > >
> > > > > > Am I missing some other avenue?
> > > > >
> > > > > There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> > > >
> > > > LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> > > > drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> > > > kind of fault on an older CPU.
> > > >
> > > > What are my options on a 586-class CPU for forcing RDTSC to complete
> > > > before later instructions?
> > >
> > > "3.3.2. Serializing Operations
> > > After executing certain instructions the Pentium processor serializes
> > > instruction execution. This means that any modifications to flags,
> > > registers, and memory for previous instructions are completed before
> > > the next instruction is fetched and executed. The prefetch queue
> > > is flushed as a result of serializing operations.
> > >
> > > The Pentium processor serializes instruction execution after executing
> > > one of the following instructions: Move to Special Register (except
> > > CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
> > > CPUID, RSM and WRMSR."
> > >
> > > from:
> > > Pentium Processor User's Manual
> > > Volume 1: Pentium Processor Data Book
> > > Order Number 241428
> > >
> > > http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
> > >
> > > So it could be rdtsc ; cpuid.
> > > lfence; rdtsc should still be preferred.
> > >
> > > It could be tested during boot and set a function pointer.
> > > Or the codepatch bits could be used.
> > >
> > > In the specific case of pvclock, can it be assumed that the host
> > > has hardware virt and would then have lfence?
> > >
> > 
> > I think this is a fair assumption.
> 
> Okay, so how about we use rdtsc_lfence() in pvclock_get_timecount()
> and, just to be safe, check for SSE2 during pvclock_match().
> 
> If in the future someone wants pvclock(4) support on guests without
> SSE2 support (if any exist) we can deal with it at that time.
> 
> Updated patch:
> 
> - Add atomic_cas_64() to amd64/include/atomic.h.
> 
>   (NetBSD calls it 

Re: [please test] pvclock(4): fix several bugs

2022-09-08 Thread Mike Larkin
On Thu, Sep 08, 2022 at 08:32:27AM -0500, Scott Cheloha wrote:
> On Tue, Sep 06, 2022 at 03:30:44AM -0700, Mike Larkin wrote:
> > On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
> > > On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> > > > On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> > > > > On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > > > > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > > > > >
> > > > > > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > > > > > >> dv@ suggested coming to the list to request testing for the 
> > > > > > >> pvclock(4)
> > > > > > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > > > > > >> these changes will only matter in the non-TSC_STABLE case on a
> > > > > > >> multiprocessor VM.
> > > > > > >>
> > > > > > >> Ideally, nothing should break.
> > > > > > >>
> > > > > > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can 
> > > > > > >> only
> > > > > > >>  use the lower 32 bits, but internally we need to track the full
> > > > > > >>  64-bit value to allow comparisons with the full value in the
> > > > > > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit 
> > > > > > >> quantity.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless 
> > > > > > >> read
> > > > > > >>  loop to get a more accurate timestamp.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), check that our TSC value doesn't 
> > > > > > >> predate
> > > > > > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous 
> > > > > > >> value.
> > > > > > >>
> > > > > > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > > > > > >>  non-TSC_STABLE case with more care.  On amd64 we can do this 
> > > > > > >> with an
> > > > > > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we 
> > > > > > >> need
> > > > > > >>  to introduce a mutex to protect our comparison and read/write.
> > > > > > >
> > > > > > > i386 has cmpxchg8b, no need to disable interrupts
> > > > > > > the ifdefs seem excessive
> > > > > >
> > > > > > How do I make use of CMPXCHG8B on i386
> > > > > > in this context?
> > > > > >
> > > > > > atomic_cas_ulong(9) is a 32-bit CAS on
> > > > > > i386.
> > > > >
> > > > > static inline uint64_t
> > > > > atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> > > > > {
> > > > >   return __sync_val_compare_and_swap(p, o, n);
> > > > > }
> > > > >
> > > > > Or md atomic.h files could have an equivalent.
> > > > > Not possible on all 32-bit archs.
> > > > >
> > > > > >
> > > > > > We can't use FP registers in the kernel, no?
> > > > >
> > > > > What do FP registers have to do with it?
> > > > >
> > > > > >
> > > > > > Am I missing some other avenue?
> > > > >
> > > > > There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> > > >
> > > > LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> > > > drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> > > > kind of fault on an older CPU.
> > > >
> > > > What are my options on a 586-class CPU for forcing RDTSC to complete
> > > > before later instructions?
> > >
> > > "3.3.2. Serializing Operations
> > > After executing certain instructions the Pentium processor serializes
> > > instruction execution. This means that any modifications to flags,
> > > registers, and memory for previous instructions are completed before
> > > the next instruction is fetched and executed. The prefetch queue
> > > is flushed as a result of serializing operations.
> > >
> > > The Pentium processor serializes instruction execution after executing
> > > one of the following instructions: Move to Special Register (except
> > > CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
> > > CPUID, RSM and WRMSR."
> > >
> > > from:
> > > Pentium Processor User's Manual
> > > Volume 1: Pentium Processor Data Book
> > > Order Number 241428
> > >
> > > http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
> > >
> > > So it could be rdtsc ; cpuid.
> > > lfence; rdtsc should still be preferred.
> > >
> > > It could be tested during boot and set a function pointer.
> > > Or the codepatch bits could be used.
> > >
> > > In the specific case of pvclock, can it be assumed that the host
> > > has hardware virt and would then have lfence?
> > >
> >
> > I think this is a fair assumption.
>
> Okay, so how about we use rdtsc_lfence() in pvclock_get_timecount()
> and, just to be safe, check for SSE2 during pvclock_match().
>
> If in the future someone wants pvclock(4) support on guests without
> SSE2 support (if any exist) we can deal with it at that time.
>
> Updated patch:
>
> - Add atomic_cas_64() to amd64/include/atomic.h.
>
>   (NetBSD calls it 

Re: [please test] pvclock(4): fix several bugs

2022-09-08 Thread Scott Cheloha
On Tue, Sep 06, 2022 at 03:30:44AM -0700, Mike Larkin wrote:
> On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
> > On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> > > On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> > > > On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > > > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > > > >
> > > > > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > > > > >> dv@ suggested coming to the list to request testing for the 
> > > > > >> pvclock(4)
> > > > > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > > > > >> these changes will only matter in the non-TSC_STABLE case on a
> > > > > >> multiprocessor VM.
> > > > > >>
> > > > > >> Ideally, nothing should break.
> > > > > >>
> > > > > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can 
> > > > > >> only
> > > > > >>  use the lower 32 bits, but internally we need to track the full
> > > > > >>  64-bit value to allow comparisons with the full value in the
> > > > > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> > > > > >>
> > > > > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless 
> > > > > >> read
> > > > > >>  loop to get a more accurate timestamp.
> > > > > >>
> > > > > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > > > > >>
> > > > > >> - In pvclock_get_timecount(), check that our TSC value doesn't 
> > > > > >> predate
> > > > > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> > > > > >>
> > > > > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > > > > >>  non-TSC_STABLE case with more care.  On amd64 we can do this with 
> > > > > >> an
> > > > > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we 
> > > > > >> need
> > > > > >>  to introduce a mutex to protect our comparison and read/write.
> > > > > >
> > > > > > i386 has cmpxchg8b, no need to disable interrupts
> > > > > > the ifdefs seem excessive
> > > > >
> > > > > How do I make use of CMPXCHG8B on i386
> > > > > in this context?
> > > > >
> > > > > atomic_cas_ulong(9) is a 32-bit CAS on
> > > > > i386.
> > > >
> > > > static inline uint64_t
> > > > atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> > > > {
> > > > return __sync_val_compare_and_swap(p, o, n);
> > > > }
> > > >
> > > > Or md atomic.h files could have an equivalent.
> > > > Not possible on all 32-bit archs.
> > > >
> > > > >
> > > > > We can't use FP registers in the kernel, no?
> > > >
> > > > What do FP registers have to do with it?
> > > >
> > > > >
> > > > > Am I missing some other avenue?
> > > >
> > > > There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> > >
> > > LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> > > drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> > > kind of fault on an older CPU.
> > >
> > > What are my options on a 586-class CPU for forcing RDTSC to complete
> > > before later instructions?
> >
> > "3.3.2. Serializing Operations
> > After executing certain instructions the Pentium processor serializes
> > instruction execution. This means that any modifications to flags,
> > registers, and memory for previous instructions are completed before
> > the next instruction is fetched and executed. The prefetch queue
> > is flushed as a result of serializing operations.
> >
> > The Pentium processor serializes instruction execution after executing
> > one of the following instructions: Move to Special Register (except
> > CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
> > CPUID, RSM and WRMSR."
> >
> > from:
> > Pentium Processor User's Manual
> > Volume 1: Pentium Processor Data Book
> > Order Number 241428
> >
> > http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
> >
> > So it could be rdtsc ; cpuid.
> > lfence; rdtsc should still be preferred.
> >
> > It could be tested during boot and set a function pointer.
> > Or the codepatch bits could be used.
> >
> > In the specific case of pvclock, can it be assumed that the host
> > has hardware virt and would then have lfence?
> >
> 
> I think this is a fair assumption.

Okay, so how about we use rdtsc_lfence() in pvclock_get_timecount()
and, just to be safe, check for SSE2 during pvclock_match().

If in the future someone wants pvclock(4) support on guests without
SSE2 support (if any exist) we can deal with it at that time.

Updated patch:

- Add atomic_cas_64() to amd64/include/atomic.h.

  (NetBSD calls it atomic_cas_64(), so this is consistent with their
  atomic function naming scheme.)

  I am unsure whether it would be better to just do this:

#define atomic_cas_64(_p, _e, _n) _atomic_cas_ulong((unsigned long *)(_p), 
(_e), (_n))

  If I don't cast the pointer, the compiler complains, so I added
  _atomic_cas_ullong().

  

Re: [please test] pvclock(4): fix several bugs

2022-09-06 Thread Mike Larkin
On Sun, Sep 04, 2022 at 02:50:10PM +1000, Jonathan Gray wrote:
> On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> > On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> > > On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > > >
> > > > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > > > >> dv@ suggested coming to the list to request testing for the 
> > > > >> pvclock(4)
> > > > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > > > >> these changes will only matter in the non-TSC_STABLE case on a
> > > > >> multiprocessor VM.
> > > > >>
> > > > >> Ideally, nothing should break.
> > > > >>
> > > > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
> > > > >>  use the lower 32 bits, but internally we need to track the full
> > > > >>  64-bit value to allow comparisons with the full value in the
> > > > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> > > > >>
> > > > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
> > > > >>  loop to get a more accurate timestamp.
> > > > >>
> > > > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > > > >>
> > > > >> - In pvclock_get_timecount(), check that our TSC value doesn't 
> > > > >> predate
> > > > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> > > > >>
> > > > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > > > >>  non-TSC_STABLE case with more care.  On amd64 we can do this with an
> > > > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
> > > > >>  to introduce a mutex to protect our comparison and read/write.
> > > > >
> > > > > i386 has cmpxchg8b, no need to disable interrupts
> > > > > the ifdefs seem excessive
> > > >
> > > > How do I make use of CMPXCHG8B on i386
> > > > in this context?
> > > >
> > > > atomic_cas_ulong(9) is a 32-bit CAS on
> > > > i386.
> > >
> > > static inline uint64_t
> > > atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> > > {
> > >   return __sync_val_compare_and_swap(p, o, n);
> > > }
> > >
> > > Or md atomic.h files could have an equivalent.
> > > Not possible on all 32-bit archs.
> > >
> > > >
> > > > We can't use FP registers in the kernel, no?
> > >
> > > What do FP registers have to do with it?
> > >
> > > >
> > > > Am I missing some other avenue?
> > >
> > > There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> >
> > LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> > drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> > kind of fault on an older CPU.
> >
> > What are my options on a 586-class CPU for forcing RDTSC to complete
> > before later instructions?
>
> "3.3.2. Serializing Operations
> After executing certain instructions the Pentium processor serializes
> instruction execution. This means that any modifications to flags,
> registers, and memory for previous instructions are completed before
> the next instruction is fetched and executed. The prefetch queue
> is flushed as a result of serializing operations.
>
> The Pentium processor serializes instruction execution after executing
> one of the following instructions: Move to Special Register (except
> CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
> CPUID, RSM and WRMSR."
>
> from:
> Pentium Processor User's Manual
> Volume 1: Pentium Processor Data Book
> Order Number 241428
>
> http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf
>
> So it could be rdtsc ; cpuid.
> lfence; rdtsc should still be preferred.
>
> It could be tested during boot and set a function pointer.
> Or the codepatch bits could be used.
>
> In the specific case of pvclock, can it be assumed that the host
> has hardware virt and would then have lfence?
>

I think this is a fair assumption.

-ml



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Jonathan Gray
On Sat, Sep 03, 2022 at 05:33:01PM -0500, Scott Cheloha wrote:
> On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> > On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > > 
> > > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > > >> dv@ suggested coming to the list to request testing for the pvclock(4)
> > > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > > >> these changes will only matter in the non-TSC_STABLE case on a
> > > >> multiprocessor VM.
> > > >> 
> > > >> Ideally, nothing should break.
> > > >> 
> > > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
> > > >>  use the lower 32 bits, but internally we need to track the full
> > > >>  64-bit value to allow comparisons with the full value in the
> > > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> > > >> 
> > > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
> > > >>  loop to get a more accurate timestamp.
> > > >> 
> > > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > > >> 
> > > >> - In pvclock_get_timecount(), check that our TSC value doesn't predate
> > > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> > > >> 
> > > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > > >>  non-TSC_STABLE case with more care.  On amd64 we can do this with an
> > > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
> > > >>  to introduce a mutex to protect our comparison and read/write.
> > > > 
> > > > i386 has cmpxchg8b, no need to disable interrupts
> > > > the ifdefs seem excessive
> > > 
> > > How do I make use of CMPXCHG8B on i386
> > > in this context?
> > > 
> > > atomic_cas_ulong(9) is a 32-bit CAS on
> > > i386.
> > 
> > static inline uint64_t
> > atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> > {
> > return __sync_val_compare_and_swap(p, o, n);
> > }
> > 
> > Or md atomic.h files could have an equivalent.
> > Not possible on all 32-bit archs.
> > 
> > > 
> > > We can't use FP registers in the kernel, no?
> > 
> > What do FP registers have to do with it?
> > 
> > > 
> > > Am I missing some other avenue?
> > 
> > There is no rdtsc_lfence() on i386.  Initial diff doesn't build.
> 
> LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
> drop rdtsc_lfence() into cpufunc.h and proceed without causing some
> kind of fault on an older CPU.
> 
> What are my options on a 586-class CPU for forcing RDTSC to complete
> before later instructions?

"3.3.2. Serializing Operations
After executing certain instructions the Pentium processor serializes
instruction execution. This means that any modifications to flags,
registers, and memory for previous instructions are completed before
the next instruction is fetched and executed. The prefetch queue
is flushed as a result of serializing operations.

The Pentium processor serializes instruction execution after executing
one of the following instructions: Move to Special Register (except
CRO), INVD, INVLPG, IRET, IRETD, LGDT, LLDT, LIDT, LTR, WBINVD,
CPUID, RSM and WRMSR."

from:
Pentium Processor User's Manual
Volume 1: Pentium Processor Data Book
Order Number 241428

http://bitsavers.org/components/intel/pentium/1993_Intel_Pentium_Processor_Users_Manual_Volume_1.pdf

So it could be rdtsc ; cpuid.
lfence; rdtsc should still be preferred.

It could be tested during boot and set a function pointer.
Or the codepatch bits could be used.

In the specific case of pvclock, can it be assumed that the host
has hardware virt and would then have lfence?



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Scott Cheloha
On Sat, Sep 03, 2022 at 10:37:31PM +1000, Jonathan Gray wrote:
> On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > > 
> > > ???On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> > >> dv@ suggested coming to the list to request testing for the pvclock(4)
> > >> driver.  Attached is a patch that corrects several bugs.  Most of
> > >> these changes will only matter in the non-TSC_STABLE case on a
> > >> multiprocessor VM.
> > >> 
> > >> Ideally, nothing should break.
> > >> 
> > >> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
> > >>  use the lower 32 bits, but internally we need to track the full
> > >>  64-bit value to allow comparisons with the full value in the
> > >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> > >> 
> > >> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
> > >>  loop to get a more accurate timestamp.
> > >> 
> > >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> > >> 
> > >> - In pvclock_get_timecount(), check that our TSC value doesn't predate
> > >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> > >> 
> > >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> > >>  non-TSC_STABLE case with more care.  On amd64 we can do this with an
> > >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
> > >>  to introduce a mutex to protect our comparison and read/write.
> > > 
> > > i386 has cmpxchg8b, no need to disable interrupts
> > > the ifdefs seem excessive
> > 
> > How do I make use of CMPXCHG8B on i386
> > in this context?
> > 
> > atomic_cas_ulong(9) is a 32-bit CAS on
> > i386.
> 
> static inline uint64_t
> atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> {
>   return __sync_val_compare_and_swap(p, o, n);
> }
> 
> Or md atomic.h files could have an equivalent.
> Not possible on all 32-bit archs.
> 
> > 
> > We can't use FP registers in the kernel, no?
> 
> What do FP registers have to do with it?
> 
> > 
> > Am I missing some other avenue?
> 
> There is no rdtsc_lfence() on i386.  Initial diff doesn't build.

LFENCE is an SSE2 extension.  As is MFENCE.  I don't think I can just
drop rdtsc_lfence() into cpufunc.h and proceed without causing some
kind of fault on an older CPU.

What are my options on a 586-class CPU for forcing RDTSC to complete
before later instructions?



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Scott Cheloha
> On Sep 3, 2022, at 07:37, Jonathan Gray  wrote:
> 
> On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
 On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
>>> 
>>> On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
 dv@ suggested coming to the list to request testing for the pvclock(4)
 driver.  Attached is a patch that corrects several bugs.  Most of
 these changes will only matter in the non-TSC_STABLE case on a
 multiprocessor VM.
 
 Ideally, nothing should break.
 
 - pvclock yields a 64-bit value.  The BSD timecounter layer can only
 use the lower 32 bits, but internally we need to track the full
 64-bit value to allow comparisons with the full value in the
 non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
 
 - In pvclock_get_timecount(), move rdtsc() up into the lockless read
 loop to get a more accurate timestamp.
 
 - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
 
 - In pvclock_get_timecount(), check that our TSC value doesn't predate
 ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
 
 - In pvclock_get_timecount(), update pvclock_lastcount in the
 non-TSC_STABLE case with more care.  On amd64 we can do this with an
 atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
 to introduce a mutex to protect our comparison and read/write.
>>> 
>>> i386 has cmpxchg8b, no need to disable interrupts
>>> the ifdefs seem excessive
>> 
>> How do I make use of CMPXCHG8B on i386
>> in this context?
>> 
>> atomic_cas_ulong(9) is a 32-bit CAS on
>> i386.
> 
> static inline uint64_t
> atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
> {
>return __sync_val_compare_and_swap(p, o, n);
> }
> 
> Or md atomic.h files could have an equivalent.
> Not possible on all 32-bit archs.

Gotcha.

>> We can't use FP registers in the kernel, no?
> 
> What do FP registers have to do with it?

I read someplace that using FP registers was
a quick and dirty way to take advantage of
the 64-bit-aligned atomic access guarantees
of the Pentium.

>> Am I missing some other avenue?
> 
> There is no rdtsc_lfence() on i386.  Initial diff doesn't build.

I will come back with a fuller patch in a bit.



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Jonathan Gray
On Sat, Sep 03, 2022 at 06:52:20AM -0500, Scott Cheloha wrote:
> > On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> > 
> > On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> >> dv@ suggested coming to the list to request testing for the pvclock(4)
> >> driver.  Attached is a patch that corrects several bugs.  Most of
> >> these changes will only matter in the non-TSC_STABLE case on a
> >> multiprocessor VM.
> >> 
> >> Ideally, nothing should break.
> >> 
> >> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
> >>  use the lower 32 bits, but internally we need to track the full
> >>  64-bit value to allow comparisons with the full value in the
> >>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> >> 
> >> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
> >>  loop to get a more accurate timestamp.
> >> 
> >> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> >> 
> >> - In pvclock_get_timecount(), check that our TSC value doesn't predate
> >>  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> >> 
> >> - In pvclock_get_timecount(), update pvclock_lastcount in the
> >>  non-TSC_STABLE case with more care.  On amd64 we can do this with an
> >>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
> >>  to introduce a mutex to protect our comparison and read/write.
> > 
> > i386 has cmpxchg8b, no need to disable interrupts
> > the ifdefs seem excessive
> 
> How do I make use of CMPXCHG8B on i386
> in this context?
> 
> atomic_cas_ulong(9) is a 32-bit CAS on
> i386.

static inline uint64_t
atomic_cas_64(volatile uint64_t *p, uint64_t o, uint64_t n)
{
return __sync_val_compare_and_swap(p, o, n);
}

Or md atomic.h files could have an equivalent.
Not possible on all 32-bit archs.

> 
> We can't use FP registers in the kernel, no?

What do FP registers have to do with it?

> 
> Am I missing some other avenue?

There is no rdtsc_lfence() on i386.  Initial diff doesn't build.



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Scott Cheloha
> On Sep 3, 2022, at 02:22, Jonathan Gray  wrote:
> 
> On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
>> dv@ suggested coming to the list to request testing for the pvclock(4)
>> driver.  Attached is a patch that corrects several bugs.  Most of
>> these changes will only matter in the non-TSC_STABLE case on a
>> multiprocessor VM.
>> 
>> Ideally, nothing should break.
>> 
>> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
>>  use the lower 32 bits, but internally we need to track the full
>>  64-bit value to allow comparisons with the full value in the
>>  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
>> 
>> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
>>  loop to get a more accurate timestamp.
>> 
>> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
>> 
>> - In pvclock_get_timecount(), check that our TSC value doesn't predate
>>  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
>> 
>> - In pvclock_get_timecount(), update pvclock_lastcount in the
>>  non-TSC_STABLE case with more care.  On amd64 we can do this with an
>>  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
>>  to introduce a mutex to protect our comparison and read/write.
> 
> i386 has cmpxchg8b, no need to disable interrupts
> the ifdefs seem excessive

How do I make use of CMPXCHG8B on i386
in this context?

atomic_cas_ulong(9) is a 32-bit CAS on
i386.

We can't use FP registers in the kernel, no?

Am I missing some other avenue?



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Matthieu Herrb
On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> dv@ suggested coming to the list to request testing for the pvclock(4)
> driver.  Attached is a patch that corrects several bugs.  Most of
> these changes will only matter in the non-TSC_STABLE case on a
> multiprocessor VM.
> 
> Ideally, nothing should break.
>

I've put this on an OpenBSD VM hosted on a proxmox VE 7 hypervisor. It
uses pvclock as the time source both before and after the patch :

kern.timecounter.hardware=pvclock0
kern.timecounter.choice=i8254(0) pvclock0(1500) acpihpet0(1000)
acpitimer0(1000)

OpenBSD 7.2-beta (GENERIC.MP) #300: Sat Sep  3 09:34:42 CEST 2022
matth...@zuma.herrb.net:/usr/obj/GENERIC.MP
real mem = 4278005760 (4079MB)
avail mem = 4130959360 (3939MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.8 @ 0xf59a0 (10 entries)
bios0: vendor SeaBIOS version "rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org" 
date 04/01/2014
bios0: QEMU Standard PC (Q35 + ICH9, 2009)
acpi0 at bios0: ACPI 3.0
acpi0: sleep states S3 S4 S5
acpi0: tables DSDT FACP APIC SSDT HPET MCFG WAET
acpi0: wakeup devices
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: Common KVM processor, 2000.60 MHz, 0f-06-01
cpu0: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SSE3,CX16,x2APIC,HV,NXE,LONG,LAHF,MELTDOWN
cpu0: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 4MB 64b/line 
16-way L2 cache, 16MB 64b/line 16-way L3 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
cpu0: apic clock running at 1000MHz
ioapic0 at mainbus0: apid 0 pa 0xfec0, version 11, 24 pins
acpihpet0 at acpi0: 1 Hz
acpimcfg0 at acpi0
acpimcfg0: addr 0xb000, bus 0-255
acpiprt0 at acpi0: bus 0 (PCI0)
"ACPI0006" at acpi0 not configured
acpipci0 at acpi0 PCI0: 0x0010 0x0011 0x
acpicmos0 at acpi0
"PNP0A06" at acpi0 not configured
"PNP0A06" at acpi0 not configured
"PNP0A06" at acpi0 not configured
"QEMU0002" at acpi0 not configured
"ACPI0010" at acpi0 not configured
"QEMUVGID" at acpi0 not configured
acpicpu0 at acpi0: C1(@1 halt!)
pvbus0 at mainbus0: KVM
pvclock0 at pvbus0
pci0 at mainbus0 bus 0
pchb0 at pci0 dev 0 function 0 "Intel 82G33 Host" rev 0x00
vga1 at pci0 dev 1 function 0 "VMware SVGA II" rev 0x00
wsdisplay0 at vga1 mux 1: console (80x25, vt100 emulation)
wsdisplay0: screen 1-5 added (80x25, vt100 emulation)
uhci0 at pci0 dev 26 function 0 "Intel 82801I USB" rev 0x03: apic 0 int 16
uhci1 at pci0 dev 26 function 1 "Intel 82801I USB" rev 0x03: apic 0 int 17
uhci2 at pci0 dev 26 function 2 "Intel 82801I USB" rev 0x03: apic 0 int 18
ehci0 at pci0 dev 26 function 7 "Intel 82801I USB" rev 0x03: apic 0 int 19
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 configuration 1 interface 0 "Intel EHCI root hub" rev 2.00/1.00 
addr 1
azalia0 at pci0 dev 27 function 0 "Intel 82801I HD Audio" rev 0x03: msi
azalia0: no HD-Audio codecs
ppb0 at pci0 dev 28 function 0 vendor "Red Hat", unknown product 0x000c rev 
0x00: apic 0 int 16
pci1 at ppb0 bus 1
ppb1 at pci0 dev 28 function 1 vendor "Red Hat", unknown product 0x000c rev 
0x00: apic 0 int 16
pci2 at ppb1 bus 2
ppb2 at pci0 dev 28 function 2 vendor "Red Hat", unknown product 0x000c rev 
0x00: apic 0 int 16
pci3 at ppb2 bus 3
ppb3 at pci0 dev 28 function 3 vendor "Red Hat", unknown product 0x000c rev 
0x00: apic 0 int 16
pci4 at ppb3 bus 4
uhci3 at pci0 dev 29 function 0 "Intel 82801I USB" rev 0x03: apic 0 int 16
uhci4 at pci0 dev 29 function 1 "Intel 82801I USB" rev 0x03: apic 0 int 17
uhci5 at pci0 dev 29 function 2 "Intel 82801I USB" rev 0x03: apic 0 int 18
ehci1 at pci0 dev 29 function 7 "Intel 82801I USB" rev 0x03: apic 0 int 19
usb1 at ehci1: USB revision 2.0
uhub1 at usb1 configuration 1 interface 0 "Intel EHCI root hub" rev 2.00/1.00 
addr 1
ppb4 at pci0 dev 30 function 0 "Intel 82801BA Hub-to-PCI" rev 0x92
pci5 at ppb4 bus 5
ppb5 at pci5 dev 1 function 0 "Red Hat Qemu PCI-PCI" rev 0x00
pci6 at ppb5 bus 6
virtio0 at pci6 dev 3 function 0 "Qumranet Virtio Memory Balloon" rev 0x00
viomb0 at virtio0
virtio0: apic 0 int 20
virtio1 at pci6 dev 10 function 0 "Qumranet Virtio Storage" rev 0x00
vioblk0 at virtio1
scsibus1 at vioblk0: 1 targets
sd0 at scsibus1 targ 0 lun 0: 
sd0: 40960MB, 512 bytes/sector, 83886080 sectors
virtio1: msix shared
virtio2 at pci6 dev 18 function 0 "Qumranet Virtio Network" rev 0x00
vio0 at virtio2: address 82:da:9e:53:3e:6b
virtio2: msix shared
ppb6 at pci5 dev 2 function 0 "Red Hat Qemu PCI-PCI" rev 0x00
pci7 at ppb6 bus 7
ppb7 at pci5 dev 3 function 0 "Red Hat Qemu PCI-PCI" rev 0x00
pci8 at ppb7 bus 8
ppb8 at pci5 dev 4 function 0 "Red Hat Qemu PCI-PCI" rev 0x00
pci9 at ppb8 bus 9
pcib0 at pci0 dev 31 function 0 "Intel 82801IB LPC" rev 0x02
ahci0 at pci0 dev 31 function 2 "Intel 82801I AHCI" rev 0x02: msi, 

Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Jonathan Gray
On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> dv@ suggested coming to the list to request testing for the pvclock(4)
> driver.  Attached is a patch that corrects several bugs.  Most of
> these changes will only matter in the non-TSC_STABLE case on a
> multiprocessor VM.
> 
> Ideally, nothing should break.
> 
> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
>   use the lower 32 bits, but internally we need to track the full
>   64-bit value to allow comparisons with the full value in the
>   non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
> 
> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
>   loop to get a more accurate timestamp.
> 
> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
> 
> - In pvclock_get_timecount(), check that our TSC value doesn't predate
>   ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
> 
> - In pvclock_get_timecount(), update pvclock_lastcount in the
>   non-TSC_STABLE case with more care.  On amd64 we can do this with an
>   atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
>   to introduce a mutex to protect our comparison and read/write.

i386 has cmpxchg8b, no need to disable interrupts
the ifdefs seem excessive

> 
> Index: pvclock.c
> ===
> RCS file: /cvs/src/sys/dev/pv/pvclock.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 pvclock.c
> --- pvclock.c 5 Nov 2021 11:38:29 -   1.8
> +++ pvclock.c 2 Sep 2022 22:54:08 -
> @@ -27,6 +27,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#if defined(__i386__)
> +#include 
> +#endif
>  
>  #include 
>  #include 
> @@ -35,7 +39,12 @@
>  #include 
>  #include 
>  
> -uint pvclock_lastcount;
> +#if defined(__amd64__)
> +volatile u_long pvclock_lastcount;
> +#elif defined(__i386__)
> +struct mutex pvclock_mtx = MUTEX_INITIALIZER(IPL_HIGH);
> +uint64_t pvclock_lastcount;
> +#endif
>  
>  struct pvclock_softc {
>   struct devicesc_dev;
> @@ -212,7 +221,7 @@ pvclock_get_timecount(struct timecounter
>  {
>   struct pvclock_softc*sc = tc->tc_priv;
>   struct pvclock_time_info*ti;
> - uint64_t tsc_timestamp, system_time, delta, ctr;
> + uint64_t system_time, delta, ctr, tsc;
>   uint32_t version, mul_frac;
>   int8_t   shift;
>   uint8_t  flags;
> @@ -220,8 +229,12 @@ pvclock_get_timecount(struct timecounter
>   ti = sc->sc_time;
>   do {
>   version = pvclock_read_begin(ti);
> + tsc = rdtsc_lfence();
> + if (ti->ti_tsc_timestamp < tsc)
> + delta = tsc - ti->ti_tsc_timestamp;
> + else
> + delta = 0;
>   system_time = ti->ti_system_time;
> - tsc_timestamp = ti->ti_tsc_timestamp;
>   mul_frac = ti->ti_tsc_to_system_mul;
>   shift = ti->ti_tsc_shift;
>   flags = ti->ti_flags;
> @@ -231,7 +244,6 @@ pvclock_get_timecount(struct timecounter
>* The algorithm is described in
>* linux/Documentation/virtual/kvm/msr.txt
>*/
> - delta = rdtsc() - tsc_timestamp;
>   if (shift < 0)
>   delta >>= -shift;
>   else
> @@ -241,10 +253,20 @@ pvclock_get_timecount(struct timecounter
>   if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
>   return (ctr);
>  
> - if (ctr < pvclock_lastcount)
> - return (pvclock_lastcount);
> -
> - atomic_swap_uint(_lastcount, ctr);
> -
> +#if defined(__amd64__)
> + u_long last;
> + do {
> + last = pvclock_lastcount;
> + if (ctr < last)
> + return last;
> + } while (atomic_cas_ulong(_lastcount, last, ctr) != last);
> +#elif defined(__i386__)
> + mtx_enter(_mtx);
> + if (pvclock_lastcount < ctr)
> + pvclock_lastcount = ctr;
> + else
> + ctr = pvclock_lastcount;
> + mtx_leave(_mtx);
> +#endif
>   return (ctr);
>  }
> 
> 



Re: [please test] pvclock(4): fix several bugs

2022-09-03 Thread Mike Larkin
On Fri, Sep 02, 2022 at 06:00:25PM -0500, Scott Cheloha wrote:
> dv@ suggested coming to the list to request testing for the pvclock(4)
> driver.  Attached is a patch that corrects several bugs.  Most of
> these changes will only matter in the non-TSC_STABLE case on a
> multiprocessor VM.
>
> Ideally, nothing should break.
>
> - pvclock yields a 64-bit value.  The BSD timecounter layer can only
>   use the lower 32 bits, but internally we need to track the full
>   64-bit value to allow comparisons with the full value in the
>   non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.
>
> - In pvclock_get_timecount(), move rdtsc() up into the lockless read
>   loop to get a more accurate timestamp.
>
> - In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
>
> - In pvclock_get_timecount(), check that our TSC value doesn't predate
>   ti->ti_tsc_timestamp, otherwise we will produce an enormous value.
>
> - In pvclock_get_timecount(), update pvclock_lastcount in the
>   non-TSC_STABLE case with more care.  On amd64 we can do this with an
>   atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
>   to introduce a mutex to protect our comparison and read/write.
>

I tested on an 8 core ESXi VM, nothing broke. But it doesn't even
have pvclock as a timesource, so I'm not sure the test is meaningful or
useful.

-ml

> Index: pvclock.c
> ===
> RCS file: /cvs/src/sys/dev/pv/pvclock.c,v
> retrieving revision 1.8
> diff -u -p -r1.8 pvclock.c
> --- pvclock.c 5 Nov 2021 11:38:29 -   1.8
> +++ pvclock.c 2 Sep 2022 22:54:08 -
> @@ -27,6 +27,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#if defined(__i386__)
> +#include 
> +#endif
>
>  #include 
>  #include 
> @@ -35,7 +39,12 @@
>  #include 
>  #include 
>
> -uint pvclock_lastcount;
> +#if defined(__amd64__)
> +volatile u_long pvclock_lastcount;
> +#elif defined(__i386__)
> +struct mutex pvclock_mtx = MUTEX_INITIALIZER(IPL_HIGH);
> +uint64_t pvclock_lastcount;
> +#endif
>
>  struct pvclock_softc {
>   struct devicesc_dev;
> @@ -212,7 +221,7 @@ pvclock_get_timecount(struct timecounter
>  {
>   struct pvclock_softc*sc = tc->tc_priv;
>   struct pvclock_time_info*ti;
> - uint64_t tsc_timestamp, system_time, delta, ctr;
> + uint64_t system_time, delta, ctr, tsc;
>   uint32_t version, mul_frac;
>   int8_t   shift;
>   uint8_t  flags;
> @@ -220,8 +229,12 @@ pvclock_get_timecount(struct timecounter
>   ti = sc->sc_time;
>   do {
>   version = pvclock_read_begin(ti);
> + tsc = rdtsc_lfence();
> + if (ti->ti_tsc_timestamp < tsc)
> + delta = tsc - ti->ti_tsc_timestamp;
> + else
> + delta = 0;
>   system_time = ti->ti_system_time;
> - tsc_timestamp = ti->ti_tsc_timestamp;
>   mul_frac = ti->ti_tsc_to_system_mul;
>   shift = ti->ti_tsc_shift;
>   flags = ti->ti_flags;
> @@ -231,7 +244,6 @@ pvclock_get_timecount(struct timecounter
>* The algorithm is described in
>* linux/Documentation/virtual/kvm/msr.txt
>*/
> - delta = rdtsc() - tsc_timestamp;
>   if (shift < 0)
>   delta >>= -shift;
>   else
> @@ -241,10 +253,20 @@ pvclock_get_timecount(struct timecounter
>   if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
>   return (ctr);
>
> - if (ctr < pvclock_lastcount)
> - return (pvclock_lastcount);
> -
> - atomic_swap_uint(_lastcount, ctr);
> -
> +#if defined(__amd64__)
> + u_long last;
> + do {
> + last = pvclock_lastcount;
> + if (ctr < last)
> + return last;
> + } while (atomic_cas_ulong(_lastcount, last, ctr) != last);
> +#elif defined(__i386__)
> + mtx_enter(_mtx);
> + if (pvclock_lastcount < ctr)
> + pvclock_lastcount = ctr;
> + else
> + ctr = pvclock_lastcount;
> + mtx_leave(_mtx);
> +#endif
>   return (ctr);
>  }



[please test] pvclock(4): fix several bugs

2022-09-02 Thread Scott Cheloha
dv@ suggested coming to the list to request testing for the pvclock(4)
driver.  Attached is a patch that corrects several bugs.  Most of
these changes will only matter in the non-TSC_STABLE case on a
multiprocessor VM.

Ideally, nothing should break.

- pvclock yields a 64-bit value.  The BSD timecounter layer can only
  use the lower 32 bits, but internally we need to track the full
  64-bit value to allow comparisons with the full value in the
  non-TSC_STABLE case.  So make pvclock_lastcount a 64-bit quantity.

- In pvclock_get_timecount(), move rdtsc() up into the lockless read
  loop to get a more accurate timestamp.

- In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().

- In pvclock_get_timecount(), check that our TSC value doesn't predate
  ti->ti_tsc_timestamp, otherwise we will produce an enormous value.

- In pvclock_get_timecount(), update pvclock_lastcount in the
  non-TSC_STABLE case with more care.  On amd64 we can do this with an
  atomic_cas_ulong(9) loop because u_long is 64 bits.  On i386 we need
  to introduce a mutex to protect our comparison and read/write.

Index: pvclock.c
===
RCS file: /cvs/src/sys/dev/pv/pvclock.c,v
retrieving revision 1.8
diff -u -p -r1.8 pvclock.c
--- pvclock.c   5 Nov 2021 11:38:29 -   1.8
+++ pvclock.c   2 Sep 2022 22:54:08 -
@@ -27,6 +27,10 @@
 #include 
 #include 
 #include 
+#include 
+#if defined(__i386__)
+#include 
+#endif
 
 #include 
 #include 
@@ -35,7 +39,12 @@
 #include 
 #include 
 
-uint pvclock_lastcount;
+#if defined(__amd64__)
+volatile u_long pvclock_lastcount;
+#elif defined(__i386__)
+struct mutex pvclock_mtx = MUTEX_INITIALIZER(IPL_HIGH);
+uint64_t pvclock_lastcount;
+#endif
 
 struct pvclock_softc {
struct devicesc_dev;
@@ -212,7 +221,7 @@ pvclock_get_timecount(struct timecounter
 {
struct pvclock_softc*sc = tc->tc_priv;
struct pvclock_time_info*ti;
-   uint64_t tsc_timestamp, system_time, delta, ctr;
+   uint64_t system_time, delta, ctr, tsc;
uint32_t version, mul_frac;
int8_t   shift;
uint8_t  flags;
@@ -220,8 +229,12 @@ pvclock_get_timecount(struct timecounter
ti = sc->sc_time;
do {
version = pvclock_read_begin(ti);
+   tsc = rdtsc_lfence();
+   if (ti->ti_tsc_timestamp < tsc)
+   delta = tsc - ti->ti_tsc_timestamp;
+   else
+   delta = 0;
system_time = ti->ti_system_time;
-   tsc_timestamp = ti->ti_tsc_timestamp;
mul_frac = ti->ti_tsc_to_system_mul;
shift = ti->ti_tsc_shift;
flags = ti->ti_flags;
@@ -231,7 +244,6 @@ pvclock_get_timecount(struct timecounter
 * The algorithm is described in
 * linux/Documentation/virtual/kvm/msr.txt
 */
-   delta = rdtsc() - tsc_timestamp;
if (shift < 0)
delta >>= -shift;
else
@@ -241,10 +253,20 @@ pvclock_get_timecount(struct timecounter
if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
return (ctr);
 
-   if (ctr < pvclock_lastcount)
-   return (pvclock_lastcount);
-
-   atomic_swap_uint(_lastcount, ctr);
-
+#if defined(__amd64__)
+   u_long last;
+   do {
+   last = pvclock_lastcount;
+   if (ctr < last)
+   return last;
+   } while (atomic_cas_ulong(_lastcount, last, ctr) != last);
+#elif defined(__i386__)
+   mtx_enter(_mtx);
+   if (pvclock_lastcount < ctr)
+   pvclock_lastcount = ctr;
+   else
+   ctr = pvclock_lastcount;
+   mtx_leave(_mtx);
+#endif
return (ctr);
 }