Here is an analysis by Paolo Bonzini:

I compared crypto/x86_64cpuid.pl and crypto/x86cpuid.pl, and the code in the
latter is wrong.

From x86_64cpuid.pl:

        mov     %edx,%r10d              # %r9d:%r10d is copy of %ecx:%edx
        bt      \$27,%r9d               # check OSXSAVE bit
        jnc     .Lclear_avx
        xor     %ecx,%ecx               # XCR0
        .byte   0x0f,0x01,0xd0          # xgetbv
        and     \$6,%eax                # isolate XMM and YMM state support
        cmp     \$6,%eax
        je      .Ldone
.Lclear_avx:
        mov     \$0xefffe7ff,%eax       # ~(1<<28|1<<12|1<<11)
        and     %eax,%r9d               # clear AVX, FMA and AMD XOP bits
.Ldone:


From x86cpuid.pl:

        &bt     ("ecx",26);             # check XSAVE bit
        &jnc    (&label("done"));
        &bt     ("ecx",27);             # check OSXSAVE bit
        &jnc    (&label("clear_xmm"));
        &xor    ("ecx","ecx");
        &data_byte(0x0f,0x01,0xd0);     # xgetbv
        &and    ("eax",6);
        &cmp    ("eax",6);
        &je     (&label("done"));
        &cmp    ("eax",2);
        &je     (&label("clear_avx"));
&set_label("clear_xmm");
        &and    ("ebp",0xfdfffffd);     # clear AESNI and PCLMULQDQ bits
        &and    ("esi",0xfeffffff);     # clear FXSR
&set_label("clear_avx");
        &and    ("ebp",0xefffe7ff);     # clear AVX, FMA and AMD XOP bits
&set_label("done");


x86_64cpuid.pl is not completely correct: if bit 1 of EAX were zero (XMM state
support not enabled by the OS), you would need to clear the AESNI and PCLMULQDQ
bits, as is done in x86cpuid.pl.  However, in practice this does not matter,
because any OS new enough to set OSXSAVE will always enable XMM support as well.

x86cpuid.pl instead is completely broken:

- the whole test is bypassed if XSAVE=1, which makes absolutely no sense.
x86_64cpuid.pl is right in testing OSXSAVE.

- if OSXSAVE=0, all SSE code is disabled, which also makes no sense, because any
OS less than 10 years old lets you use SSE (via FXSAVE) even if it does not set
OSXSAVE, and this of course includes RHEL6.

The attached patch (unfortunately not yet tested) synchronizes the two tests.

--- crypto/x86cpuid.pl	2011-10-26 17:13:03.599641479 +0200
+++ crypto/x86cpuid.pl	2011-10-26 17:41:04.400262001 +0200
@@ -119,20 +119,13 @@
 	&mov	("esi","edx");
 	&or	("ebp","ecx");		# merge AMD XOP flag
 
-	&bt	("ecx",26);		# check XSAVE bit
-	&jnc	(&label("done"));
 	&bt	("ecx",27);		# check OSXSAVE bit
-	&jnc	(&label("clear_xmm"));
-	&xor	("ecx","ecx");
+	&jnc	(&label("clear_avx"));
+	&xor	("ecx","ecx");          # XCR0
 	&data_byte(0x0f,0x01,0xd0);	# xgetbv
-	&and	("eax",6);
+	&and	("eax",6);              # isolate XMM and YMM state support
 	&cmp	("eax",6);
 	&je	(&label("done"));
-	&cmp	("eax",2);
-	&je	(&label("clear_avx"));
-&set_label("clear_xmm");
-	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
-	&and	("esi",0xfeffffff);	# clear FXSR
 &set_label("clear_avx");
 	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
 &set_label("done");

Reply via email to