diff -urN c/crypto/aes/asm/aesni-x86_64.pl d/crypto/aes/asm/aesni-x86_64.pl
--- c/crypto/aes/asm/aesni-x86_64.pl	2013-04-23 18:52:14.000000000 +0300
+++ d/crypto/aes/asm/aesni-x86_64.pl	2013-04-28 16:51:52.438125788 +0300
@@ -179,6 +179,18 @@
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+
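+# probe for AVX support in the assembler: gas >= 2.19, nasm >= 2.09 or ml64 >= 10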
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+                =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+           $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+           $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+           `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+           $1>=10);
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 
@@ -1437,6 +1449,15 @@
 .type	aesni_xts_encrypt,\@function,6
 .align	16
 aesni_xts_encrypt:
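+	# take the AVX path below only on Intel CPUs that advertise AVX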
+	mov	OPENSSL_ia32cap_P+0(%rip),%r11d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r10d
+	and	\$`1<<28`,%r10d          # mask AVX bit
+	and	\$`1<<30`,%r11d          # mask "Intel CPU" bit
+	or	%r11d,%r10d
+	cmp	\$`1<<28|1<<30`,%r10d
+	je	aesni_xts_encrypt_avx
+
 	lea	(%rsp),%rax
 	push	%rbp
 	sub	\$$frame_size,%rsp
@@ -2259,6 +2280,390 @@
 ___
 } }}
 
+
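+# Register layout for the AVX path: $tw0..$tw7 hold the eight per-block
+# tweaks, while $inout0..$inout7 (defined earlier) carry the data blocks.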
+$tw0="%xmm0";	$tw1="%xmm1";
+$tw2="%xmm10";	$tw3="%xmm11";
+$tw4="%xmm12";	$tw5="%xmm13";
+$tw6="%xmm14";	$tw7="%xmm15";
+
+# one AES round over all eight blocks; the round key is loaded into $tw7,
+# which is free here because the saved tweaks live on the stack
+sub aesround {
+my ($i)=@_;
+$code.=<<___;
+	vmovdqu	$i*16($key), $tw7
+	vaesenc	$tw7, $inout0, $inout0
+	vaesenc	$tw7, $inout1, $inout1
+	vaesenc	$tw7, $inout2, $inout2
+	vaesenc	$tw7, $inout3, $inout3
+	vaesenc	$tw7, $inout4, $inout4
+	vaesenc	$tw7, $inout5, $inout5
+	vaesenc	$tw7, $inout6, $inout6
+	vaesenc	$tw7, $inout7, $inout7
+___
+}
+
+# compute tweak $i = $j * x in GF(2^128): shift left by one bit and fold the
+# carries (selected from the replicated sign word in $inout0) back in via
+# .Lxts_magic; $inout0 is shifted too so the next call sees fresh carries
+sub tweakInit {
+my ($j,$i)=@_;
+$code.=<<___;
+	vpsrad	\$31, $inout0, $i
+	vpslld	\$1, $inout0, $inout0
+	vpsllq	\$1, $j, $inout1
+	vpand	.Lxts_magic(%rip), $i, $i
+	vpxor	$inout1, $i, $i
+___
+}
+
+# same doubling step as tweakInit, with $tw6 holding the replicated sign word
+sub nextTweak {
+my ($j,$i)=@_;
+$code.=<<___;
+	vpsrad	\$31, $tw6, $i
+	vpslld	\$1, $tw6, $tw6
+	vpsllq	\$1, $j, $tw7
+	vpand	.Lxts_magic(%rip), $i, $i
+	vpxor	$tw7, $i, $i
+___
+}
+
+# first half of the doubling step, split so it can interleave with AES rounds
+sub nextTweakStart {
+my ($j,$i)=@_;
+$code.=<<___;
+	vpsrad	\$31, $tw6, $i
+	vpslld	\$1, $tw6, $tw6
+___
+}
+
+# second half of the doubling step started by nextTweakStart
+sub nextTweakEnd {
+my ($j,$i)=@_;
+$code.=<<___;
+	vpsllq	\$1, $j, $tw7
+	vpand	.Lxts_magic(%rip), $i, $i
+	vpxor	$tw7, $i, $i
+___
+}
+$code.=<<___;
+
+.globl	aesni_xts_encrypt_avx
+.type	aesni_xts_encrypt_avx,\@function,6
+.align	16
+aesni_xts_encrypt_avx:
+___
+$code.=<<___ if ($win64);
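+	# Win64 ABI: xmm6-xmm15 are non-volatile; save them before vzeroall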
+	lea	-0xa8(%rsp),%rsp
+	vmovaps	%xmm6,0x00(%rsp)
+	vmovaps	%xmm7,0x10(%rsp)
+	vmovaps	%xmm8,0x20(%rsp)
+	vmovaps	%xmm9,0x30(%rsp)
+	vmovaps	%xmm10,0x40(%rsp)
+	vmovaps	%xmm11,0x50(%rsp)
+	vmovaps	%xmm12,0x60(%rsp)
+	vmovaps	%xmm13,0x70(%rsp)
+	vmovaps	%xmm14,0x80(%rsp)
+	vmovaps	%xmm15,0x90(%rsp)
+___
+
+$code.=<<___;
+
+	vzeroall
+	push	%rbp
+	mov	%rsp, %rbp
+	sub	\$128, %rsp		# scratch area for the eight saved tweaks
+	and	\$-16, %rsp		# vmovdqa stores below need 16-byte alignment
+
+	vmovdqu	(%r9), $tw0
+	mov	240(%r8),$rounds		# key2->rounds
+
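+	# compute the initial tweak: encrypt the IV with key2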
+	vpxor	16*0(%r8), $tw0, $tw0
+	vaesenc	16*1(%r8), $tw0, $tw0
+	vaesenc	16*2(%r8), $tw0, $tw0
+	vaesenc	16*3(%r8), $tw0, $tw0
+	vaesenc	16*4(%r8), $tw0, $tw0
+	vaesenc	16*5(%r8), $tw0, $tw0
+	vaesenc	16*6(%r8), $tw0, $tw0
+	vaesenc	16*7(%r8), $tw0, $tw0
+	vaesenc	16*8(%r8), $tw0, $tw0
+	vaesenc	16*9(%r8), $tw0, $tw0
+	vmovdqu	16*10(%r8), $tw7
+	cmp	\$10, $rounds		# 10 rounds: AES-128
+	jbe	.LencLastTweak
+	vaesenc	16*10(%r8), $tw0, $tw0
+	vaesenc	16*11(%r8), $tw0, $tw0
+	vmovdqu	16*12(%r8), $tw7
+	cmp	\$12, $rounds		# 12 rounds: AES-192
+	jbe	.LencLastTweak
+	vaesenc	16*12(%r8), $tw0, $tw0
+	vaesenc	16*13(%r8), $tw0, $tw0
+	vmovdqu	16*14(%r8), $tw7	# 14 rounds: AES-256
+.LencLastTweak:
+	vaesenclast	$tw7, $tw0, $tw0
+
+	mov	240($key),$rounds		# key1->rounds
+
+	sub	\$128, $len		# bias the length; carry set once fewer than 8 blocks remain
+	jc	.Lxts_enc_avx_tail
+	vpshufd	\$0x5f, $tw0, $inout0	# replicate the tweak sign words for carry extraction
+___
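+	# derive tweaks $tw1..$tw7 from $tw0 by repeated doubling in GF(2^128)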
+	&tweakInit($tw0, $tw1);
+	&tweakInit($tw1, $tw2);
+	&tweakInit($tw2, $tw3);
+	&tweakInit($tw3, $tw4);
+	&tweakInit($tw4, $tw5);
+	&tweakInit($tw5, $tw6);
+	&tweakInit($tw6, $tw7);
+
+$code.=<<___;
+.Lxts_enc_avx_loop:
+	# xor each input block with its tweak; save the tweaks so they can
+	# be folded back in after the last AES round
+	vpxor	0*16($inp), $tw0, $inout0
+	vmovdqa	$tw0, 16*0(%rsp)
+	vpxor	1*16($inp), $tw1, $inout1
+	vmovdqa	$tw1, 16*1(%rsp)
+	vpxor	2*16($inp), $tw2, $inout2
+	vmovdqa	$tw2, 16*2(%rsp)
+	vpxor	3*16($inp), $tw3, $inout3
+	vmovdqa	$tw3, 16*3(%rsp)
+	vpxor	4*16($inp), $tw4, $inout4
+	vmovdqa	$tw4, 16*4(%rsp)
+	vpxor	5*16($inp), $tw5, $inout5
+	vmovdqa	$tw5, 16*5(%rsp)
+	vpxor	6*16($inp), $tw6, $inout6
+	vmovdqa	$tw6, 16*6(%rsp)
+	vpxor	7*16($inp), $tw7, $inout7
+	vmovdqa	$tw7, 16*7(%rsp)
+
+	vpshufd	\$0x5f, $tw7, $tw6
+___
+	&nextTweak($tw7, $tw0);	# tw0 = first tweak of the next batch
+$code.=<<___;
+	vmovdqu	($key), $tw7		# round 0: whiten with the first round key
+	vpxor	$tw7, $inout0, $inout0
+	vpxor	$tw7, $inout1, $inout1
+	vpxor	$tw7, $inout2, $inout2
+	vpxor	$tw7, $inout3, $inout3
+	vpxor	$tw7, $inout4, $inout4
+	vpxor	$tw7, $inout5, $inout5
+	vpxor	$tw7, $inout6, $inout6
+	vpxor	$tw7, $inout7, $inout7
+___
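+	# interleave the tweak derivation with the AES rounds to hide latency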
+	&nextTweakStart($tw0, $tw1);
+	&aesround(1);
+	&nextTweakEnd($tw0, $tw1);
+	&nextTweakStart($tw1, $tw2);
+	&aesround(2);
+	&nextTweakEnd($tw1, $tw2);
+	&nextTweakStart($tw2, $tw3);
+	&aesround(3);
+	&nextTweakEnd($tw2, $tw3);
+	&aesround(4);
+	&nextTweakStart($tw3, $tw4);
+	&aesround(5);
+	&nextTweakEnd($tw3, $tw4);
+	&aesround(6);
+	&nextTweakStart($tw4, $tw5);
+	&aesround(7);
+	&nextTweakEnd($tw4, $tw5);
+	&aesround(8);
+$code.=<<___;
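+	# derive tw6 from tw5 inline, split around round 9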
+	vpxor	$tw6, $tw6, $tw6
+	vpsrad	\$31, $tw5, $tw6
+	vpshufd	\$0x13, $tw6, $tw6
+___
+	&aesround(9);
+$code.=<<___;
+	vpsllq	\$1, $tw5, $tw7
+	vpand	.Lxts_magic(%rip), $tw6, $tw6
+	vpxor	$tw7, $tw6, $tw6
+
+	vmovdqu	10*16($key), $tw7
+	cmp	\$10, $rounds
+	jbe	.Lxts_enc_loop_avx_last
+___
+	&aesround(10);
+	&aesround(11);
+$code.=<<___;
+	vmovdqu	12*16($key), $tw7
+	cmp	\$12, $rounds
+	jbe	.Lxts_enc_loop_avx_last
+___
+	&aesround(12);
+	&aesround(13);
+$code.=<<___;
+	vmovdqu	14*16($key), $tw7
+.Lxts_enc_loop_avx_last:
+
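+	# fold each saved tweak into the last round key, so a single
+	# vaesenclast performs both the final round and the tweak xor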
+	vaesenclast	$tw7, $inout0, $inout0
+	vpxor	16*0(%rsp), $inout0, $inout0
+	vmovdqu	$inout0, 16*0($out)
+	vpxor	16*1(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout1, $inout1
+	vpxor	16*2(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout2, $inout2
+	vpxor	16*3(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout3, $inout3
+	vpxor	16*4(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout4, $inout4
+	vpxor	16*5(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout5, $inout5
+	vpxor	16*6(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout6, $inout6
+	vpxor	16*7(%rsp), $tw7, $inout0
+	vaesenclast	$inout0, $inout7, $inout7
+
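+	# start deriving the next tw7 while the stores below retire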
+	vpxor	$tw7, $tw7, $tw7
+	vpsrad	\$31, $tw6, $tw7
+	vpshufd	\$0x13, $tw7, $tw7
+
+	vmovdqu	$inout1, 16*1($out)
+	vmovdqu	$inout2, 16*2($out)
+	vmovdqu	$inout3, 16*3($out)
+	vmovdqu	$inout4, 16*4($out)
+	vmovdqu	$inout5, 16*5($out)
+	vmovdqu	$inout6, 16*6($out)
+	vmovdqu	$inout7, 16*7($out)
+
+	vpsllq	\$1, $tw6, $inout1
+	vpand	.Lxts_magic(%rip), $tw7, $tw7
+	vpxor	$inout1, $tw7, $tw7
+
+	lea	8*16($inp), $inp
+	lea	8*16($out), $out
+	sub	\$8*16, $len
+	jnc	.Lxts_enc_avx_loop
+___
+$code.=<<___;
+.Lxts_enc_avx_tail:
+	add	\$8*16, $len		# undo the loop bias; zero means no remainder
+	je	.Lxts_enc_end
+
+	sub	\$16, $len
+	jc	.Lxts_enc_avx_steal
+
+.Lxts_enc_avx_rem_loop:
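+	# one block at a time: tweak-whiten, encrypt with key1, whiten again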
+	vpxor	($inp), $tw0, $inout0
+	vpxor	16*0($key), $inout0, $inout0
+	vaesenc	16*1($key), $inout0, $inout0
+	vaesenc	16*2($key), $inout0, $inout0
+	vaesenc	16*3($key), $inout0, $inout0
+	vaesenc	16*4($key), $inout0, $inout0
+	vaesenc	16*5($key), $inout0, $inout0
+	vaesenc	16*6($key), $inout0, $inout0
+	vaesenc	16*7($key), $inout0, $inout0
+	vaesenc	16*8($key), $inout0, $inout0
+	vaesenc	16*9($key), $inout0, $inout0
+	vmovdqu	16*10($key), $inout1
+	cmp	\$10, $rounds
+	jbe	.Lxts_enc_avx_r
+	vaesenc	16*10($key), $inout0, $inout0
+	vaesenc	16*11($key), $inout0, $inout0
+	vmovdqu	16*12($key), $inout1
+	cmp	\$12, $rounds
+	jbe	.Lxts_enc_avx_r
+	vaesenc	16*12($key), $inout0, $inout0
+	vaesenc	16*13($key), $inout0, $inout0
+	vmovdqu	16*14($key), $inout1
+.Lxts_enc_avx_r:
+	vaesenclast	$inout1, $inout0, $inout0
+	vpxor	$tw0, $inout0, $inout0
+	vmovdqu	$inout0, ($out)
+	lea	16($inp), $inp
+	lea	16($out), $out
+
+	vpxor	$tw1, $tw1, $tw1	# advance the tweak: multiply by x in GF(2^128)
+	vpsrad	\$31, $tw0, $tw1
+	vpshufd	\$0x13, $tw1, $tw1
+	vpsllq	\$1, $tw0, $tw0
+	vpand	.Lxts_magic(%rip), $tw1, $tw1
+	vpxor	$tw0, $tw1, $tw0
+
+	sub	\$16, $len
+	jc	.Lxts_enc_avx_steal
+	jmp	.Lxts_enc_avx_rem_loop
+
+.Lxts_enc_avx_steal:
+	add	\$16, $len		# recover the count of trailing bytes
+	je	.Lxts_enc_end
+
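+	# ciphertext stealing: move the first len bytes of the last ciphertext
+	# block out to the partial tail, splice the leftover plaintext into the
+	# last block, then re-encrypt that block in place with the next tweak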
+	xor	%r11, %r11
+.Lxts_enc_steal_cpy:
+	movb	-16($out, %r11), %r9b
+	movb	%r9b, ($out, %r11)
+	movb	($inp, %r11), %r9b
+	movb	%r9b, -16($out, %r11)
+	inc	%r11
+	dec	$len
+	jne	.Lxts_enc_steal_cpy
+
+	vpxor	-16($out), $tw0, $inout0
+	vpxor	16*0($key), $inout0, $inout0
+	vaesenc	16*1($key), $inout0, $inout0
+	vaesenc	16*2($key), $inout0, $inout0
+	vaesenc	16*3($key), $inout0, $inout0
+	vaesenc	16*4($key), $inout0, $inout0
+	vaesenc	16*5($key), $inout0, $inout0
+	vaesenc	16*6($key), $inout0, $inout0
+	vaesenc	16*7($key), $inout0, $inout0
+	vaesenc	16*8($key), $inout0, $inout0
+	vaesenc	16*9($key), $inout0, $inout0
+	vmovdqu	16*10($key), $inout1
+	cmp	\$10, $rounds
+	jbe	.Lxts_enc_avx_s
+	vaesenc	16*10($key), $inout0, $inout0
+	vaesenc	16*11($key), $inout0, $inout0
+	vmovdqu	16*12($key), $inout1
+	cmp	\$12, $rounds
+	jbe	.Lxts_enc_avx_s
+	vaesenc	16*12($key), $inout0, $inout0
+	vaesenc	16*13($key), $inout0, $inout0
+	vmovdqu	16*14($key), $inout1
+.Lxts_enc_avx_s:
+	vaesenclast	$inout1, $inout0, $inout0
+	vpxor	$tw0, $inout0, $inout0
+	vmovdqu	$inout0, -16($out)
+
+.Lxts_enc_end:
+	vzeroall			# wipe key material and data; must precede the xmm6-15 restore
+	mov	%rbp, %rsp
+	pop	%rbp
+___
+$code.=<<___ if ($win64);
+	vmovaps	0x00(%rsp),%xmm6
+	vmovaps	0x10(%rsp),%xmm7
+	vmovaps	0x20(%rsp),%xmm8
+	vmovaps	0x30(%rsp),%xmm9
+	vmovaps	0x40(%rsp),%xmm10
+	vmovaps	0x50(%rsp),%xmm11
+	vmovaps	0x60(%rsp),%xmm12
+	vmovaps	0x70(%rsp),%xmm13
+	vmovaps	0x80(%rsp),%xmm14
+	vmovaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+___
+$code.=<<___;
+.Lxts_enc_avx_ret:
+	ret
+.size	aesni_xts_encrypt_avx,.-aesni_xts_encrypt_avx
+___
 ########################################################################
 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
 #			    size_t length, const AES_KEY *key,
