I think I have found the culprit!
Related to what I am pointing out here
- https://groups.google.com/forum/#!topic/osv-dev/-7vIPwOJsyg - I thought
about enabling some FPU exceptions which are disabled by default. Once I
changed 0x037f to 0x037e which enables invalid operand exception and ran my
test and got this exception:
[registers] fps=0.0 q=-0.0 size=N/A time=00:00:14.00 bitrate=N/A dup=0 drop=
273 speed= 28x
RIP: 0x000000000044fb28 <???+4520744>
RFL: 0x0000000000010206 CS: 0x0000000000000008 SS: 0x0000000000000010
RAX: 0x0000000000000040 RBX: 0x00001000000faa22 RCX: 0x00002000001fd960
RDX: 0x00002000001fd618
RSI: 0x000000000000000a RDI: 0x00002000001fd580 RBP: 0x00002000001fd5f0
R8: 0x0000000000000000
R9: 0x0000000000000000 R10: 0x0000000000000000 R11: 0x00000000ffffffff
R12: 0x0000000000000000
R13: 0x0000000000000000 R14: 0x0000000000000000 R15: 0x00000000ffffffff
RSP: 0x00002000001fd508
DUMMY_HANDLER for math_fault aborting.
[backtrace]
0x00000000003a3ad6 <math_fault+22>
0x00000000003a11dc <???+3805660>
0x000000000045218b <vfprintf+171>
0x0000000000454138 <vsnprintf+216>
0x00001000000bfdbe <std::__cxx11::basic_string<char, std::char_traits<char>,
std::allocator<char> > __gnu_cxx::__to_xstring<std::__cxx11::basic_string<
char, std::char_traits<char>, std::allocator<char> >, char>(int (*)(char*,
unsigned long, char const*, __va_list_tag*), unsigned long, char const*,
...)+174>
0x00001000000f1447 <httpserver::json::formatter::to_json[abi:cxx11](float)+
103>
0x00002000001fdf3f <???+2088767>
0x00002000001fdf27 <???+2088743>
0x0000100003c5a49f <???+63284383>
0x4810c0834808578a <???+1208506250>
Once I put breakpoint in dump_registers() and connected with gdd to get
stacktrace I saw this:
Breakpoint 1, dump_registers (ef=0xffff800002a79068) at arch/x64/dump.cc:17
17 osv::lookup_name_demangled(
(gdb) bt
#0 dump_registers (ef=0xffff800002a79068) at arch/x64/dump.cc:17
#1 0x00000000003a3ac9 in math_fault (ef=<optimized out>) at
arch/x64/exceptions.cc:338
#2 <signal handler called>
*#3 0x000000000044fb28 in pop_arg (arg=arg@entry=0x2000001fd580,
type=<optimized out>, ap=ap@entry=0x2000001fd618)*
* at libc/stdio/vfprintf.c:165*
#4 0x000000000045161c in pop_arg (ap=0x2000001fd618, type=<optimized out>,
arg=0x2000001fd580) at libc/stdio/vfprintf.c:143
#5 printf_core (f=f@entry=0x2000001fd7a0, fmt=fmt@entry=0x1000000faa20
"%f", ap=ap@entry=0x2000001fd618,
nl_arg=nl_arg@entry=0x2000001fd6b0, nl_type=nl_type@entry=0x2000001fd630)
at libc/stdio/vfprintf.c:538
#6 0x000000000045218c in vfprintf (f=f@entry=0x2000001fd7a0,
fmt=0x1000000faa20 "%f", ap=<optimized out>) at libc/stdio/vfprintf.c:675
#7 0x0000000000454139 in vsnprintf (s=0x2000001fd8c0 "\256", n=<optimized
out>, fmt=<optimized out>, ap=<optimized out>)
at libc/stdio/vsnprintf.c:33
#8 0x00001000000bfdbf in ?? ()
#9 0x00000000000000ae in ?? ()
#10 0x0000000000550b48 in std::__cxx11::basic_string<char,
std::char_traits<char>, std::allocator<char> >::_M_replace(unsigned long,
unsigned
#11 0x0000000000000000 in ?? ()
Here is the code around line 165 in vfprintf.c
140 static void pop_arg(union arg *arg, int type, va_list *ap)
141 {
142 /* Give the compiler a hint for optimizing the switch. */
143 if ((unsigned)type > MAXSTATE) return;
144 switch (type) {
145 case PTR: arg->p = va_arg(*ap, void *);
146 break; case INT: arg->i = va_arg(*ap, int);
147 break; case UINT: arg->i = va_arg(*ap, unsigned int);
148 #ifndef LONG_IS_INT
149 break; case LONG: arg->i = va_arg(*ap, long);
150 break; case ULONG: arg->i = va_arg(*ap, unsigned long);
151 #endif
152 break; case ULLONG: arg->i = va_arg(*ap, unsigned long
long);
153 break; case SHORT: arg->i = (short)va_arg(*ap, int);
154 break; case USHORT: arg->i = (unsigned short)va_arg(*ap,
int);
155 break; case CHAR: arg->i = (signed char)va_arg(*ap, int);
156 break; case UCHAR: arg->i = (unsigned char)va_arg(*ap,
int);
157 #ifdef ODD_TYPES
158 break; case LLONG: arg->i = va_arg(*ap, long long);
159 break; case SIZET: arg->i = va_arg(*ap, size_t);
160 break; case IMAX: arg->i = va_arg(*ap, intmax_t);
161 break; case UMAX: arg->i = va_arg(*ap, uintmax_t);
162 break; case PDIFF: arg->i = va_arg(*ap, ptrdiff_t);
163 break; case UIPTR: arg->i = (uintptr_t)va_arg(*ap, void *);
164 #endif
*165 break; case DBL: arg->f = va_arg(*ap, double);*
166 break; case LDBL: arg->f = va_arg(*ap, long double);
167 }
168 }
What could have gone wrong in the line 165 that triggered invalid operand
exception?
My theory is this:
1. Httpserver tries to print thread list using this code:
118 os_threads.set_handler([](const_req req) {
119 using namespace std::chrono;
120 httpserver::json::Threads threads;
121 threads.time_ms = duration_cast<milliseconds>
122 (osv::clock::wall::now().time_since_epoch()).count();
123 httpserver::json::Thread thread;
124 sched::with_all_threads([&](sched::thread &t) {
125 thread.id = t.id();
126 thread.status = t.get_status();
127 auto tcpu = t.tcpu();
128 thread.cpu = tcpu ? tcpu->id : -1;
129 thread.cpu_ms =
duration_cast<milliseconds>(t.thread_clock()).count();
130 thread.switches = t.stat_switches.get();
131 thread.migrations = t.stat_migrations.get();
132 thread.preemptions = t.stat_preemptions.get();
133 thread.name = t.name();
134 thread.priority = t.priority();
135 thread.stack_size = t.get_stack_info().size;
136 thread.status = t.get_status();
137 threads.list.push(thread);
138 });
139 return threads;
140 });
This code uses sched::with_all_threads() to iterate over the list of
threads which uses mutex guarding against adding new threads and destroying
existing one. However I think this does not guard any of the values grabbed
to populate httpserver::json::Thread from* being concurrently modified by
OSv scheduler* (am I wrong?). *The priority field is float which when
copied in not-thread-safe way ends up being junk which does not represent
any valid float value (invalid operand).*
2. The code downstream to serialize JSON calls vfprintf which at line 165
tries to extract the single element from va_list. This probably ends up
using FPU which rejects corrupted priority float value. Because normally
invalid operand exception is disabled vfprintf happily continues and gets
to that famous loop which probably keeps raising same exception which gets
ignored until we reach the beginning of the unmapped page and boom !!!
How do you like my theory?
I think this theory has its shortcomings - I think it explains open issues
#536 and #1010 (and possibly to soon closed #490) but I think it fails to
explain the newest one I created
- https://github.com/cloudius-systems/osv/issues/1018 (there is no
httpserver running). Maybe the latter one is due to some issues with
missing initialization of FPU control word (but I doubt it).
Waldek
On Thursday, November 29, 2018 at 6:19:45 PM UTC-5, Waldek Kozaczuk wrote:
>
> Unless I made a mistake in the code that verifies fpu state corruption
> which showed no corruption I think it might be worth to ponder possibility
> of different nature of the root cause of this and other issues where
> fmt_fp() shows up in the stack trace.
>
> What if the problem is actually caused by wrong/incompatible compiler
> options that drive what machine code is generated to handle floating point
> operation. For example I noticed that when I configured musl independently
> on my machine I saw it add following options related to floating point math:
>
> CFLAGS_C99FSE = -std=c99 -nostdinc -ffreestanding -*fexcess-precision=*
> *standard* *-frounding-**math* -Wa,--noexecstack
>
> I have not enough experience to make any conclusive observations but two
> of those -fexcess-precision and -frounding-math do affet code using float,
> double, long double etc. I did not see those in main OSv makefile so I
> wonder if musl compiled code is wrong and misbehaves.
>
> I think FPU has control word with RC fields that control rounding (
> http://www.website.masmforum.com/tutorials/fptute/fpuchap1.htm) what if
> our code setups FPU rounding that is incompatible with how we compile musl.
> Just wild theory.
>
> Adding musl math library documentation -
> https://wiki.musl-libc.org/mathematical-library.html- there are
> paragraphs that talk about rounding and precision.
>
> On Thursday, November 29, 2018 at 2:33:26 PM UTC-5, Waldek Kozaczuk wrote:
>>
>> I wonder if per this - https://software.intel.com/en-us/node/523375 - we
>> are enabling correct bits for xsave if caste is what being used here.
>>
>> “The second operand is a save/restore mask specifying the saved/restored
>> extended states. The value of the mask is ANDed with
>> XFEATURE_ENABLED_MASK(XCR0). A particular extended state is saved/restored
>> only if the corresponding bit of both save/restore mask and
>> XFEATURE_ENABLED_MASK is set to '1'.”
>
>
--
You received this message because you are subscribed to the Google Groups "OSv
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.