diff -urN a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
--- a/crypto/aes/asm/aesni-x86_64.pl	2013-03-20 18:24:08.001308000 +0200
+++ b/crypto/aes/asm/aesni-x86_64.pl	2013-03-20 18:25:08.000077000 +0200
@@ -179,6 +179,17 @@
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+
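+# Emit the AVX code path only if the assembler can encode AVX instructions:
+# GNU as 2.19+, or on Windows NASM 2.09+ / MASM (ml64) version 10+.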
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+                =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+           $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+           $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+           `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+           $1>=10);
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 
@@ -1023,6 +1034,15 @@
 .align	16
 aesni_ctr32_encrypt_blocks:
 ___
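+# Take the AVX path only on Intel processors with AVX: bit 28 of the second
+# OPENSSL_ia32cap_P word (AVX) and bit 30 of the first ("Intel CPU") must
+# both be set.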
+$code.=<<___ if ($avx);
+	mov	OPENSSL_ia32cap_P+0(%rip),%r11d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r10d
+	and	\$`1<<28`,%r10d		# mask AVX bit
+	and	\$`1<<30`,%r11d		# mask "Intel CPU" bit
+	or	%r11d,%r10d
+	cmp	\$`1<<28|1<<30`,%r10d
+	je	.Lctr32_avx_shortcut
+___
 $code.=<<___ if ($win64);
 	lea	-0xa8(%rsp),%rsp
 	movaps	%xmm6,0x00(%rsp)
@@ -1333,6 +1353,348 @@
 	ret
 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
 ___
+
+}
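+# Helpers for the AVX code path below: aesround() emits one AES round with
+# round key $i for eight parallel blocks, aesround_7() does the same for
+# seven blocks (used by the tail code).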
+sub aesround() {
+my ($i)=@_;
+$code.=<<___;
+	vmovdqu	$i*16($key), $rndkey0
+	vaesenc	$rndkey0, $inout0, $inout0
+	vaesenc	$rndkey0, $inout1, $inout1
+	vaesenc	$rndkey0, $inout2, $inout2
+	vaesenc	$rndkey0, $inout3, $inout3
+	vaesenc	$rndkey0, $inout4, $inout4
+	vaesenc	$rndkey0, $inout5, $inout5
+	vaesenc	$rndkey0, $inout6, $inout6
+	vaesenc	$rndkey0, $inout7, $inout7
+___
+}
+sub aesround_7() {
+my ($i)=@_;
+$code.=<<___;
+	vmovdqu	$i*16($key), $rndkey0
+	vaesenc	$rndkey0, $inout0, $inout0
+	vaesenc	$rndkey0, $inout1, $inout1
+	vaesenc	$rndkey0, $inout2, $inout2
+	vaesenc	$rndkey0, $inout3, $inout3
+	vaesenc	$rndkey0, $inout4, $inout4
+	vaesenc	$rndkey0, $inout5, $inout5
+	vaesenc	$rndkey0, $inout6, $inout6
+___
+}
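+# nextctr() refreshes counter block $i on the stack: it increments the
+# host-order counter in $CTR, byte-swaps a copy to big-endian and xors it
+# with the matching dword of round key 0 (cached in $K3), so the stored
+# block already includes the initial AddRoundKey.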
+sub nextctr() {
+my ($i)=@_;
+my $CTR="%r9d";
+my $HLP="%r11d";
+my $K3="%eax";
+$code.=<<___;
+	add	\$1, $CTR
+	mov	$CTR, $HLP
+	bswap	$HLP
+	xor	$K3, $HLP
+	mov	$HLP, 3*4+$i*16(%rsp)
+___
+}
+############################################################################
+# void aesni_ctr32_encrypt_blocks_avx (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+############################################################################
+# Copyright(c) 2012, Intel Corp.                                            
+# Developers and authors:                                                   
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                  
+# (1) Intel Architecture Group, Microprocessor and Chipset Development,     
+#     Israel Development Center, Haifa, Israel                              
+# (2) University of Haifa                                                   
+############################################################################
+# LICENSE:                                                                  
+# This submission to OpenSSL is to be made available under the OpenSSL      
+# license, and only to the OpenSSL project, in order to allow integration   
+# into the publicly distributed code.                                       
+# The use of this code, or portions of this code, or concepts embedded in   
+# this code, or modification of this code and/or algorithm(s) in it, or the 
+# use of this code for any other purpose than stated above, requires special
+# licensing.                                                                
+############################################################################
+if ($avx)
+{
+my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15));
+my $CTR="%r9d";
+my $HLP="%r11d";
+my $K3="%eax";
+
+$code.=<<___;
+.globl	aesni_ctr32_encrypt_blocks_avx
+.type	aesni_ctr32_encrypt_blocks_avx,\@function,5
+.align	16
+aesni_ctr32_encrypt_blocks_avx:
+.Lctr32_avx_shortcut:
+	test	$len, $len		# no blocks to process: return right away
+	jz	.Lctr32_avx_ret
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,0x00(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+	movaps	%xmm10,0x40(%rsp)
+	movaps	%xmm11,0x50(%rsp)
+	movaps	%xmm12,0x60(%rsp)
+	movaps	%xmm13,0x70(%rsp)
+	movaps	%xmm14,0x80(%rsp)
+	movaps	%xmm15,0x90(%rsp)
+___
+$code.=<<___;
+	mov	240($key), $rnds_
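+# Note: the aesni key schedule stores 9/11/13 in key->rounds for
+# AES-128/192/256; the cmp 9 / cmp 11 checks below rely on this.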
+	push	%rbp
+	mov	%rsp, %rbp
+	sub	\$128, %rsp
+	andq	\$-64, %rsp
+
+# In counter mode the top 96 bits of the counter block are constant for a
+# given IV, so only the low 32-bit counter changes from block to block and
+# can be maintained in a general-purpose register. The initial xor with
+# round key 0 can likewise be applied to those 96 bits once, up front.
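+#
+# Illustration (sketch only, names not used in the code below):
+#	counter block i        = IV[0..11] || BE32(ctr0+i)
+#	counter block i xor K0 = (IV xor K0)[0..11] || (BE32(ctr0+i) xor K0[12..15])
+# so only the last dword of each stashed block ever needs refreshing.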
+
+	mov	3*4($ivp), $CTR
+	mov	3*4($key), $K3
+	bswap	$CTR
+
+	sub	\$128, $inp
+	sub	\$128, $out
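+# Bias the pointers down by 128; both the main loop and the tail add 128
+# back before touching them.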
+# Load the IV and xor it with round key 0; the first 96 bits of the result
+# remain constant for the entire message.
+	vmovdqu	($key), $in0
+	vpxor	($ivp), $in0, $in0
+# Stash eight copies on the stack; from here on only the last 32 bits of each
+# counter block are updated.
+	vmovdqa	$in0, 0*16(%rsp)
+	vmovdqa	$in0, 1*16(%rsp)
+	vmovdqa	$in0, 2*16(%rsp)
+	vmovdqa	$in0, 3*16(%rsp)
+	vmovdqa	$in0, 4*16(%rsp)
+	vmovdqa	$in0, 5*16(%rsp)
+	vmovdqa	$in0, 6*16(%rsp)
+	vmovdqa	$in0, 7*16(%rsp)
+
+	mov	$CTR, $HLP
+	bswap	$HLP
+	xor	$K3, $HLP
+	mov	$HLP, 3*4+16*0(%rsp)
+___
+	&nextctr(1);
+	&nextctr(2);
+	&nextctr(3);
+	&nextctr(4);
+	&nextctr(5);
+	&nextctr(6);
+	&nextctr(7);
+$code.=<<___;
+	cmp	\$8, $len
+	jb	.Lctr32_avx_tail
+
+.Lctr32_avx_loop:
+
+	add	\$128, $inp
+	add	\$128, $out
+
+	vmovdqa	0*16(%rsp), $inout0
+	vmovdqa	1*16(%rsp), $inout1
+	vmovdqa	2*16(%rsp), $inout2
+	vmovdqa	3*16(%rsp), $inout3
+	vmovdqa	4*16(%rsp), $inout4
+	vmovdqa	5*16(%rsp), $inout5
+	vmovdqa	6*16(%rsp), $inout6
+	vmovdqa	7*16(%rsp), $inout7
+___
+	&aesround(1);
+	&nextctr(0);
+	&aesround(2);
+	&nextctr(1);
+	&aesround(3);
+	&nextctr(2);
+	&aesround(4);
+	&nextctr(3);
+	&aesround(5);
+	&nextctr(4);
+	&aesround(6);
+	&nextctr(5);
+	&aesround(7);
+	&nextctr(6);
+	&aesround(8);
+	&nextctr(7);
+	&aesround(9);
+$code.=<<___;
+	vmovdqu 10*16($key), $rndkey1
+
+	cmp	\$9, $rnds_
+	je	.Lctr_32_avx_enclast
+___
+	&aesround(10);
+	&aesround(11);
+$code.=<<___;
+	vmovdqu 12*16($key), $rndkey1
+
+	cmp	\$11, $rnds_
+	je	.Lctr_32_avx_enclast
+___
+	&aesround(12);
+	&aesround(13);
+$code.=<<___;
+	vmovdqu 14*16($key), $rndkey1
+
+.Lctr_32_avx_enclast:
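+# $rndkey1 holds the last round key; it is xored with each plaintext block
+# first, so vaesenclast (which xors its result with the supplied round key)
+# produces ciphertext directly, folding the CTR xor into the final round.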
+	vpxor	0*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout0, $inout0
+	vpxor	1*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout1, $inout1
+	vpxor	2*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout2, $inout2
+	vpxor	3*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout3, $inout3
+	vpxor	4*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout4, $inout4
+	vpxor	5*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout5, $inout5
+	vpxor	6*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout6, $inout6
+	vpxor	7*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout7, $inout7
+
+	vmovdqu	$inout0, 0*16($out)
+	vmovdqu	$inout1, 1*16($out)
+	vmovdqu	$inout2, 2*16($out)
+	vmovdqu	$inout3, 3*16($out)
+	vmovdqu	$inout4, 4*16($out)
+	vmovdqu	$inout5, 5*16($out)
+	vmovdqu	$inout6, 6*16($out)
+	vmovdqu	$inout7, 7*16($out)
+
+	sub	\$8, $len
+	cmp	\$8, $len
+	jae	.Lctr32_avx_loop
+___
+$code.=<<___;
+.Lctr32_avx_tail:
+
+	test	$len, $len
+	jz	.Lctr32_avx_end
+
+	vzeroall
+	add	\$128, $out
+	add	\$128, $inp
+
+# The AES instructions have high latency, so it is cheaper to encrypt seven
+# counter blocks unconditionally, even though some of the results may be
+# discarded, than to branch on the exact tail length.
+
+	vmovdqa	0*16(%rsp), $inout0
+	vmovdqa	1*16(%rsp), $inout1
+	vmovdqa	2*16(%rsp), $inout2
+	vmovdqa	3*16(%rsp), $inout3
+	vmovdqa	4*16(%rsp), $inout4
+	vmovdqa	5*16(%rsp), $inout5
+	vmovdqa	6*16(%rsp), $inout6
+___
+
+	&aesround_7(1);
+	&aesround_7(2);
+	&aesround_7(3);
+	&aesround_7(4);
+	&aesround_7(5);
+	&aesround_7(6);
+	&aesround_7(7);
+	&aesround_7(8);
+	&aesround_7(9);
+
+$code.=<<___;
+
+	vmovdqu 10*16($key), $rndkey1
+
+	cmp	\$9, $rnds_
+	je	.Lctr_32_avx_enclast_tail
+___
+	&aesround_7(10);
+	&aesround_7(11);
+
+$code.=<<___;
+
+	vmovdqu 12*16($key), $rndkey1
+
+	cmp	\$11, $rnds_
+	je	.Lctr_32_avx_enclast_tail
+___
+
+	&aesround_7(12);
+	&aesround_7(13);
+
+$code.=<<___;
+
+	vmovdqu 14*16($key), $rndkey1
+
+.Lctr_32_avx_enclast_tail:
+
+	vpxor	0*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout0, $inout0
+	vmovdqu	$inout0, 0*16($out)
+	dec	$len
+	jz	.Lctr32_avx_end
+	vpxor	1*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout1, $inout1
+	vmovdqu	$inout1, 1*16($out)
+	dec	$len
+	jz	.Lctr32_avx_end
+	vpxor	2*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout2, $inout2
+	vmovdqu	$inout2, 2*16($out)
+	dec	$len
+	jz	.Lctr32_avx_end
+	vpxor	3*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout3, $inout3
+	vmovdqu	$inout3, 3*16($out)
+	dec	$len
+	jz	.Lctr32_avx_end
+	vpxor	4*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout4, $inout4
+	vmovdqu	$inout4, 4*16($out)
+	dec	$len
+	jz	.Lctr32_avx_end
+	vpxor	5*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout5, $inout5
+	vmovdqu	$inout5, 5*16($out)
+	dec	$len
+	jz	.Lctr32_avx_end
+	vpxor	6*16($inp), $rndkey1, $rndkey0
+	vaesenclast	$rndkey0, $inout6, $inout6
+	vmovdqu	$inout6, 6*16($out)
+
+
+.Lctr32_avx_end:
+
+	mov	%rbp, %rsp
+	pop	%rbp
+	vzeroall			# clear AVX state (and key material) before the win64 restores
+___
+$code.=<<___ if ($win64);
+	movaps	0x00(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+___
+$code.=<<___;
+.Lctr32_avx_ret:
+	ret
+.size	aesni_ctr32_encrypt_blocks_avx,.-aesni_ctr32_encrypt_blocks_avx
+___
 }
 
 ######################################################################
