>>> On 02.11.12 at 18:30, "H. Peter Anvin" <h...@zytor.com> wrote:
> Aren't we actually talking just about PV here?
> 
> If so the test is wrong.

No - this can equally affect "fully" virtualized guests (where the
CR0.TS accesses can involve VM exits).
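
To illustrate the point: the SSE (and similarly the MMX) routines have
to clear and later restore CR0.TS around their xor loop, roughly as in
the sketch below (simplified, with an assumed function name - not the
kernel's exact XMMS_SAVE/XMMS_RESTORE macros). Each of those CR0
accesses can trap to the hypervisor, which is what makes unconditionally
forcing the SSE path so much more expensive than the calibration code
assumes:

    /*
     * Rough sketch only: the pattern the SSE xor routines use around
     * their inner loop.  On bare metal the CR0 accesses are cheap
     * register moves; in a PV or HVM guest each one can mean a round
     * trip through the hypervisor.
     */
    static void xor_sse_sketch(unsigned long bytes,
                               unsigned long *p1, unsigned long *p2)
    {
        unsigned long cr0;

        preempt_disable();
        cr0 = read_cr0();       /* can trap under virtualization */
        clts();                 /* clear CR0.TS - can trap as well */

        /* ... save the XMM registers, run the SSE xor loop ... */

        write_cr0(cr0);         /* restore CR0.TS - possibly another trap */
        preempt_enable();
    }

With that overhead, the generic integer routines can easily come out
ahead, which is why they need to be included in the measurement (and
why the unconditional AVX selection is bypassed) when running on a
hypervisor.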

Jan

> Jan Beulich <jbeul...@suse.com> wrote:
> 
>>In virtualized environments, the CR0.TS management needed here can be a
>>lot slower than the original authors of this code anticipated. In such
>>cases, forcing the use of the SSE- (or MMX-) based implementations is
>>not desirable; actual measurements should always be done instead.
>>
>>For consistency, pull into the shared (32- and 64-bit) header not only
>>the inclusion of the generic code, but also that of the AVX variants.
>>
>>Signed-off-by: Jan Beulich <jbeul...@suse.com>
>>Cc: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
>>
>>---
>> arch/x86/include/asm/xor.h    |    8 +++++++-
>> arch/x86/include/asm/xor_32.h |   22 ++++++++++------------
>> arch/x86/include/asm/xor_64.h |   10 ++++++----
>> 3 files changed, 23 insertions(+), 17 deletions(-)
>>
>>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor.h
>>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor.h
>>@@ -487,6 +487,12 @@ static struct xor_block_template xor_blo
>> 
>> #undef XOR_CONSTANT_CONSTRAINT
>> 
>>+/* Also try the AVX routines */
>>+#include <asm/xor_avx.h>
>>+
>>+/* Also try the generic routines. */
>>+#include <asm-generic/xor.h>
>>+
>> #ifdef CONFIG_X86_32
>> # include <asm/xor_32.h>
>> #else
>>@@ -494,6 +500,6 @@ static struct xor_block_template xor_blo
>> #endif
>> 
>> #define XOR_SELECT_TEMPLATE(FASTEST) \
>>-     AVX_SELECT(FASTEST)
>>+     (cpu_has_hypervisor ? (FASTEST) : AVX_SELECT(FASTEST))
>> 
>> #endif /* _ASM_X86_XOR_H */
>>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_32.h
>>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_32.h
>>@@ -537,12 +537,6 @@ static struct xor_block_template xor_blo
>>      .do_5 = xor_sse_5,
>> };
>> 
>>-/* Also try the AVX routines */
>>-#include <asm/xor_avx.h>
>>-
>>-/* Also try the generic routines.  */
>>-#include <asm-generic/xor.h>
>>-
>> /* We force the use of the SSE xor block because it can write around L2.
>>    We may also be able to load into the L1 only depending on how the cpu
>>    deals with a load to a line that is being prefetched.  */
>>@@ -553,15 +547,19 @@ do {                                             \
>>      if (cpu_has_xmm) {                              \
>>              xor_speed(&xor_block_pIII_sse);         \
>>              xor_speed(&xor_block_sse_pf64);         \
>>-     } else if (cpu_has_mmx) {                       \
>>+             if (!cpu_has_hypervisor)                \
>>+                     break;                          \
>>+     }                                               \
>>+     if (cpu_has_mmx) {                              \
>>              xor_speed(&xor_block_pII_mmx);          \
>>              xor_speed(&xor_block_p5_mmx);           \
>>-     } else {                                        \
>>-             xor_speed(&xor_block_8regs);            \
>>-             xor_speed(&xor_block_8regs_p);          \
>>-             xor_speed(&xor_block_32regs);           \
>>-             xor_speed(&xor_block_32regs_p);         \
>>+             if (!cpu_has_hypervisor)                \
>>+                     break;                          \
>>      }                                               \
>>+     xor_speed(&xor_block_8regs);                    \
>>+     xor_speed(&xor_block_8regs_p);                  \
>>+     xor_speed(&xor_block_32regs);                   \
>>+     xor_speed(&xor_block_32regs_p);                 \
>> } while (0)
>> 
>> #endif /* _ASM_X86_XOR_32_H */
>>--- 3.7-rc3-x86-xor.orig/arch/x86/include/asm/xor_64.h
>>+++ 3.7-rc3-x86-xor/arch/x86/include/asm/xor_64.h
>>@@ -9,10 +9,6 @@ static struct xor_block_template xor_blo
>>      .do_5 = xor_sse_5,
>> };
>> 
>>-
>>-/* Also try the AVX routines */
>>-#include <asm/xor_avx.h>
>>-
>> /* We force the use of the SSE xor block because it can write around L2.
>>    We may also be able to load into the L1 only depending on how the cpu
>>    deals with a load to a line that is being prefetched.  */
>>@@ -22,6 +18,12 @@ do {                                               \
>>      AVX_XOR_SPEED;                          \
>>      xor_speed(&xor_block_sse_pf64);         \
>>      xor_speed(&xor_block_sse);              \
>>+     if (cpu_has_hypervisor) {               \
>>+             xor_speed(&xor_block_8regs);    \
>>+             xor_speed(&xor_block_8regs_p);  \
>>+             xor_speed(&xor_block_32regs);   \
>>+             xor_speed(&xor_block_32regs_p); \
>>+     }                                       \
>> } while (0)
>> 
>> #endif /* _ASM_X86_XOR_64_H */
> 
> -- 
> Sent from my mobile phone. Please excuse brevity and lack of formatting.

