Files ../master_01_09_2015/.git/index and ./.git/index differ
diff -urN ../master_01_09_2015/Configure ./Configure
--- ../master_01_09_2015/Configure	2015-09-01 11:25:05.962599731 +0300
+++ ./Configure	2015-09-07 11:43:54.702684155 +0300
@@ -338,7 +338,7 @@
     x86_64_asm => {
 	template	=> 1,
 	cpuid_obj       => "x86_64cpuid.o",
-	bn_obj          => "x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o",
+	bn_obj          => "x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o rsaz-vpmadd.o",
 	ec_obj          => "ecp_nistz256.o ecp_nistz256-x86_64.o",
 	aes_obj         => "aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o",
 	md5_obj         => "md5-x86_64.o",
diff -urN ../master_01_09_2015/Makefile ./Makefile
--- ../master_01_09_2015/Makefile	2015-09-07 18:16:46.105739718 +0300
+++ ./Makefile	2015-09-07 11:43:57.419219498 +0300
@@ -91,7 +91,7 @@
 
 # CPUID module collects small commonly used assembler snippets
 CPUID_OBJ= x86_64cpuid.o
-BN_ASM= x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
+BN_ASM= x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o rsaz-vpmadd.o
 EC_ASM= ecp_nistz256.o ecp_nistz256-x86_64.o
 DES_ENC= des_enc.o fcrypt_b.o
 AES_ENC= aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
diff -urN ../master_01_09_2015/crypto/bn/Makefile ./crypto/bn/Makefile
--- ../master_01_09_2015/crypto/bn/Makefile	2015-09-01 11:25:09.704858924 +0300
+++ ./crypto/bn/Makefile	2015-09-07 11:48:04.208658125 +0300
@@ -109,6 +109,8 @@
 	$(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@
 rsaz-avx2.s:	asm/rsaz-avx2.pl 
 	$(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@
+rsaz-vpmadd.s:	asm/rsaz-vpmadd.S
+	$(CC) $(CFLAGS) -E asm/rsaz-vpmadd.S > $@
 
 bn-ia64.s:	asm/ia64.S
 	$(CC) $(CFLAGS) -E asm/ia64.S > $@
diff -urN ../master_01_09_2015/crypto/bn/asm/rsaz-vpmadd.S ./crypto/bn/asm/rsaz-vpmadd.S
--- ../master_01_09_2015/crypto/bn/asm/rsaz-vpmadd.S	1970-01-01 02:00:00.000000000 +0200
+++ ./crypto/bn/asm/rsaz-vpmadd.S	2015-09-10 08:32:05.049881491 +0300
@@ -0,0 +1,342 @@
+##############################################################################
+# Copyright 2015 Intel Corporation                                           #
+#                                                                            #
+# Licensed under the Apache License, Version 2.0 (the "License");            #
+# you may not use this file except in compliance with the License.           #
+# You may obtain a copy of the License at                                    #
+#                                                                            #
+#    http://www.apache.org/licenses/LICENSE-2.0                              #
+#                                                                            #
+# Unless required by applicable law or agreed to in writing, software        #
+# distributed under the License is distributed on an "AS IS" BASIS,          #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
+# See the License for the specific language governing permissions and        #
+# limitations under the License.                                             #
+#                                                                            #
+##############################################################################
+#                                                                            # 
+#  Developers and authors:                                                   # 
+#  Shay Gueron, and Nir Drucker                                              # 
+#  (1) Intel Corporation, Israel Development Center                          # 
+#  (2) University of Haifa                                                   # 
+#  Reference:                                                                # 
+#  S.Gueron and V.Krasnov, "Software Implementation of Modular Exponentiation#
+#  , Using Advanced Vector Instructions Architectures"                       #
+#  S. Gueron, V. Krasnov,                                                    #
+#  "New CPU instructions for speeding up modular exponentiation"             #
+#                                                                            #
+##############################################################################
+
+# Prototypes of functions in this document:
+# int rsaz_vpmadd52_eligible();
+# void AMM_1536_IFMA(uint64_t *rp, const uint64_t *aptr, const uint64_t *bptr, const uint64_t *nptr, uint64_t n0);
+# void rsaz_select1536_vpmadd(uint64_t *val, uint64_t *in_t, int index, int limit);
+
+.align 64
+.Land_mask:
+  .quad  0xfffffffffffff
+
+################################################################################
+.globl	rsaz_vpmadd52_eligible
+.type	rsaz_vpmadd52_eligible,@function
+.align	32
+rsaz_vpmadd52_eligible:
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	movl	%eax, %ecx
+	
+	shrl	$21, %eax	# vpmadd52 flag
+	shrl	$17, %ecx	# avx3.1 flag
+	and		%ecx, %eax
+	andl	$1,%eax
+	ret
+.size	rsaz_vpmadd52_eligible,.-rsaz_vpmadd52_eligible
+################################################################################
+
+
+.set TMP, %zmm0
+
+.set A0, %zmm1
+.set A1, %zmm2
+.set A2, %zmm3
+.set A3, %zmm4
+
+.set M0, %zmm9
+.set M1, %zmm10
+.set M2, %zmm11
+.set M3, %zmm12
+
+.set X0, %zmm17
+.set X1, %zmm18
+.set X2, %zmm19
+.set X3, %zmm20
+
+.set Y_curr, %zmm25
+.set Y_prev, %zmm26
+.set B_curr, %zmm27
+.set B_prev, %zmm28
+.set K, %zmm29
+.set ZERO, %zmm30
+
+.set itr, %r10
+
+.macro VMPADD52LUQ_K_MUL_X0_PLUS_Y_CURR
+  .byte 0x62, 0x22, 0x95, 0x40, 0xb4, 0xc9
+.endm
+
+.macro VMPADD52HUQ_A_MUL_B_PREV_PLUS_X
+  .byte 0x62, 0x82, 0xf5, 0x48, 0xb5, 0xcc
+  .byte 0x62, 0x82, 0xed, 0x48, 0xb5, 0xd4
+  .byte 0x62, 0x82, 0xe5, 0x48, 0xb5, 0xdc
+  .byte 0x62, 0x82, 0xdd, 0x48, 0xb5, 0xe4
+.endm
+
+.macro VMPADD52HUQ_M_MUL_Y_PREV_PLUS_X
+  .byte 0x62, 0x82, 0xb5, 0x48, 0xb5, 0xca
+  .byte 0x62, 0x82, 0xad, 0x48, 0xb5, 0xd2
+  .byte 0x62, 0x82, 0xa5, 0x48, 0xb5, 0xda
+  .byte 0x62, 0x82, 0x9d, 0x48, 0xb5, 0xe2
+.endm
+
+.macro VMPADD52LUQ_A0_MUL_B_CURR_PLUS_X0
+  .byte 0x62, 0x82, 0xf5, 0x48, 0xb4, 0xcb
+.endm
+
+.macro VMPADD52LUQ_A_MUL_B_CURR_PLUS_X
+  .byte 0x62, 0x82, 0xed, 0x48, 0xb4, 0xd3
+  .byte 0x62, 0x82, 0xe5, 0x48, 0xb4, 0xdb
+  .byte 0x62, 0x82, 0xdd, 0x48, 0xb4, 0xe3
+.endm
+
+.macro VMPADD52LUQ_M_MUL_Y_CURR_PLUS_X
+  .byte 0x62, 0x82, 0xb5, 0x48, 0xb4, 0xc9
+  .byte 0x62, 0x82, 0xad, 0x48, 0xb4, 0xd1
+  .byte 0x62, 0x82, 0xa5, 0x48, 0xb4, 0xd9
+  .byte 0x62, 0x82, 0x9d, 0x48, 0xb4, 0xe1
+.endm
+
+
+################################################################################
+# AMM in AVX3 + mpadd52hlq
+# void AMM_1536_IFMA(uint64_t* res, uint64_t* a64, uint64_t* b64, uint64_t* m64, uint64_t k0)
+# Almost Montgomery Multiplication with Montgomery Friendly modulus
+# optimized for 30 redundant (radix 2^52) word operands
+# 
+# resPtr - 30 qwords for words <= 30
+# aPtr - 30 qwords for words <= 30
+# bPtr - 30 qwords for words <= 30
+# mPtr - 30 qwords for words <= 30
+
+.set resPtr, %rdi
+.set aPtr,   %rsi
+.set bPtr,   %rdx
+.set mPtr,   %rcx
+.set k0,     %r8
+.set words,  %r9
+
+.type AMM_1536_IFMA,@function
+.globl AMM_1536_IFMA
+.hidden AMM_1536_IFMA
+.align 16
+AMM_1536_IFMA:
+
+  push %rbx
+  push %r10
+  push %r11
+  push %r12
+  push %r13
+  push %r14
+  sub $64, %rsp
+
+  #mask k1 selects 1 element, k2 selects 6
+  mov   $0x1,%rax
+  mov   $0x3f,%rbx
+  kmovw  %eax,%k1
+  kmovw  %ebx,%k2
+
+  vpxorq ZERO, ZERO, ZERO
+  vpbroadcastq  k0, K
+
+  vpxorq B_curr, B_curr, B_curr
+  vpxorq Y_curr, Y_curr, Y_curr
+
+  #Load a
+  vmovdqu64 64*0(aPtr), A0
+  vmovdqu64 64*1(aPtr), A1
+  vmovdqu64 64*2(aPtr), A2
+  #mov only 6 elements, zero the rest
+  vmovdqu64 64*3(aPtr), A3{%k2}{z}
+
+  #Load m
+  vmovdqu64 64*0(mPtr), M0
+  vmovdqu64 64*1(mPtr), M1
+  vmovdqu64 64*2(mPtr), M2
+  #mov only 6 elements, zero the rest
+  vmovdqu64 64*3(mPtr), M3{%k2}{z}
+
+  #X[i]=0
+  vpxorq X0, X0, X0
+  vpxorq X1, X1, X1
+  vpxorq X2, X2, X2
+  vpxorq X3, X3, X3
+
+  #counter for 30 digits 31 iterations
+  mov $31, itr
+
+.LAMM_1536_IFMA_loop:
+  vmovdqa64   Y_curr, Y_prev
+  vmovdqa64   B_curr, B_prev
+  vpbroadcastq    (bPtr), B_curr
+  lea     8(bPtr), bPtr
+
+  # Shift the X in zmms right by a word
+  vpsrlq  $52, X0, TMP
+  valignq $1, X0, X1, X0
+  vpaddq  TMP, X0, X0{%k1}
+
+  valignq $1, X1, X2, X1
+  valignq $1, X2, X3, X2
+  valignq $1, X3, ZERO, X3
+
+  # High multiplications
+  VMPADD52HUQ_A_MUL_B_PREV_PLUS_X
+
+  # Low multiplications
+  VMPADD52HUQ_M_MUL_Y_PREV_PLUS_X
+
+  dec  itr
+  je  .LAMM_1536_IFMA_loop_end
+
+  VMPADD52LUQ_A0_MUL_B_CURR_PLUS_X0
+  vpxorq      Y_curr, Y_curr, Y_curr
+  VMPADD52LUQ_K_MUL_X0_PLUS_Y_CURR
+
+  vpermq      Y_curr, ZERO, Y_curr
+
+  VMPADD52LUQ_A_MUL_B_CURR_PLUS_X
+  VMPADD52LUQ_M_MUL_Y_CURR_PLUS_X
+
+  jmp .LAMM_1536_IFMA_loop
+
+.LAMM_1536_IFMA_loop_end:
+
+  vmovdqu64 X0, 64*0(resPtr)
+  vmovdqu64 X1, 64*1(resPtr)
+  vmovdqu64 X2, 64*2(resPtr)
+
+  vmovdqu64 X3, (%rsp)
+
+  movq 0*8(%rsp), %r11
+  movq %r11, 64*3+0*8(resPtr)
+  movq 1*8(%rsp), %r11
+  movq %r11, 64*3+1*8(resPtr)
+  movq 2*8(%rsp), %r11
+  movq %r11, 64*3+2*8(resPtr)
+  movq 3*8(%rsp), %r11
+  movq %r11, 64*3+3*8(resPtr)
+  movq 4*8(%rsp), %r11
+  movq %r11, 64*3+4*8(resPtr)
+  movq 5*8(%rsp), %r11
+  movq %r11, 64*3+5*8(resPtr)
+
+.Lend:
+  mov  $30, itr
+  mov  $52, %r11
+  xor  %r14, %r14
+  mov  .Land_mask(%rip), %r12
+
+.LFixLoop:
+  adc  (resPtr), %r14
+  mov  %r14, %r13
+  and %r12, %r13
+  mov  %r13, (resPtr)
+  shrx %r11, %r14, %r14
+  lea  8(resPtr), resPtr
+  dec  itr
+  jnz  .LFixLoop
+
+  add $64, %rsp
+
+  pop %r14
+  pop %r13
+  pop %r12
+  pop %r11
+  pop %r10
+  pop %rbx
+
+  ret
+.size AMM_1536_IFMA, .-AMM_1536_IFMA
+
+################################################################################
+# Read the value with index 'idx' from the table 'tbl' in constant time.
+# The values are 1536 bit redundant representation, that means each value is 30
+# qwords. The values reside sequentially in the table.
+# The number of values in the table is 1<<w_size (2^w_size).
+# The assumed legal values are (not checked):
+# 0 < w_size < 7
+# 0 <= idx < 2^w_size
+#
+# void rsaz_select1536_vpmadd(uint64_t* res, uint64_t *tbl, int idx, int w_size);
+
+.type    rsaz_select1536_vpmadd,@function
+.globl   rsaz_select1536_vpmadd
+.hidden  rsaz_select1536_vpmadd
+
+rsaz_select1536_vpmadd:
+
+.set res, %rdi
+.set tbl, %rsi
+.set idx, %rdx
+.set w, %rcx
+
+.set RES0, %zmm0
+.set RES1, %zmm1
+.set RES2, %zmm2
+.set RES3, %zmm3
+
+.set ONE, %zmm8
+.set IDX, %zmm9
+.set CUR, %zmm10
+
+  sub $64, %rsp
+  
+  vpbroadcastq  idx, IDX  
+  mov  $1, %rax
+  vpbroadcastq  %rax, ONE
+  shl  %cl, %rax
+  
+  vpxorq CUR, CUR, CUR
+  
+1:
+  vpcmpeqq   IDX, CUR, %k2
+  vmovdqu64  64*0(tbl), RES0{%k2}
+  vmovdqu64  64*1(tbl), RES1{%k2}
+  vmovdqu64  64*2(tbl), RES2{%k2}
+  vmovdqu64  64*3(tbl), RES3{%k2}
+
+  vpaddq    ONE, CUR, CUR
+  lea      30*8(tbl), tbl
+  dec  %rax
+  jnz  1b
+
+  vmovdqu64  RES0, 64*0(res)
+  vmovdqu64  RES1, 64*1(res)
+  vmovdqu64  RES2, 64*2(res)
+  vmovdqu64  RES3, (%rsp)
+
+  movq 0*8(%rsp), %r11
+  movq %r11, 64*3+0*8(res)
+  movq 1*8(%rsp), %r11
+  movq %r11, 64*3+1*8(res)
+  movq 2*8(%rsp), %r11
+  movq %r11, 64*3+2*8(res)
+  movq 3*8(%rsp), %r11
+  movq %r11, 64*3+3*8(res)
+  movq 4*8(%rsp), %r11
+  movq %r11, 64*3+4*8(res)
+  movq 5*8(%rsp), %r11
+  movq %r11, 64*3+5*8(res)
+
+  add $64, %rsp
+ret
+
+.size rsaz_select1536_vpmadd, .-rsaz_select1536_vpmadd
diff -urN ../master_01_09_2015/crypto/bn/bn_exp.c ./crypto/bn/bn_exp.c
--- ../master_01_09_2015/crypto/bn/bn_exp.c	2015-09-01 11:25:10.102810656 +0300
+++ ./crypto/bn/bn_exp.c	2015-09-07 11:25:35.181778979 +0300
@@ -696,7 +696,17 @@
      * RSAZ exponentiation. For further information see
      * crypto/bn/rsaz_exp.c and accompanying assembly modules.
      */
-    if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
+    if ((24 == a->top) && (BN_num_bits(m) == 1536) && rsaz_vpmadd52_eligible())
+ 	  {
+      if (NULL == bn_wexpand(rr, 24)) goto err;
+      RSAZ_mod_exp_vpmadd52(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], BN_num_bits(p), 1536, 1);
+      rr->top = 24;
+      rr->neg = 0;
+      bn_correct_top(rr);
+      ret = 1;
+      goto err;
+ 		}
+   	else if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
         && rsaz_avx2_eligible()) {
         if (NULL == bn_wexpand(rr, 16))
             goto err;
diff -urN ../master_01_09_2015/crypto/bn/rsaz_exp.c ./crypto/bn/rsaz_exp.c
--- ../master_01_09_2015/crypto/bn/rsaz_exp.c	2015-09-01 11:25:10.273899904 +0300
+++ ./crypto/bn/rsaz_exp.c	2015-09-10 08:31:45.108977339 +0300
@@ -35,7 +35,7 @@
 *                                                                            *
 ******************************************************************************
 * Developers and authors:                                                    *
-* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
+* Shay Gueron (1, 2), Vlad Krasnov (1) and Nir Drucker(1,2)                  *
 * (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
 * (2) University of Haifa, Israel                                            *
 *****************************************************************************/
@@ -337,6 +337,272 @@
     OPENSSL_cleanse(storage, sizeof(storage));
 }
 
+ALIGN64 static const BN_ULONG two96_red52[30] =
+ 	{0,1UL<<44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+void AMM_1536_IFMA(uint64_t res[40], const uint64_t a[40], const uint64_t b[40], const uint64_t m[40], uint64_t k0);
+void rsaz_select1536_vpmadd(unsigned long out[40], const unsigned long *tbl, int idx, int w_size);
+
+#define IFMA_MASK ((uint64_t)0xFFFFFFFFFFFFF)
+
+/* Convert 1536 bit value in, into the redundant representation radix 2^52
+   the result is 30 redundant words */
+static void norm2red52_1536(BN_ULONG out[30], const BN_ULONG in[24])
+{
+  int i;
+  uint8_t* in_str = (uint8_t*)in;
+
+  for(i=0; i<15; i++)
+  {
+    out[i*2] = (*(BN_ULONG*)in_str) & IFMA_MASK;
+    in_str+=6;
+    out[i*2+1] = ((*(BN_ULONG*)in_str) >> 4) & IFMA_MASK;
+    in_str+=7;
+  }
+
+  out[29] &=  0xfffffff;
+}
+
+/* Convert 1536 bit value in from redundant representation (occupying 30 redundant words)
+   into the regular representation, the result is 24 qwords */
+static void red2norm52_1536(BN_ULONG out[24], const BN_ULONG in[30])
+{
+  int i;
+  uint8_t* out_str = (uint8_t*)out;
+  for(i=0; i<24; i++) out[i] = 0;
+  for(i=0; i<15; i++)
+  {
+    (*(BN_ULONG*)out_str) = in[i*2];
+    out_str+=6;
+    (*(BN_ULONG*)out_str) ^= in[i*2+1] << 4;
+    out_str+=7;
+  }
+}
+
+static unsigned long LZCNT(unsigned long in)
+{
+    unsigned long res;
+    asm("lzcnt %1, %0\n\t" : "=r"(res): "r"(in));
+ 
+    return res;
+}
+ 
+static unsigned long BZHI(unsigned long in, unsigned long idx)
+{
+
+    unsigned long res = in;
+
+   //asm("bzhi %2, %1, %0\n\t" : "=r"(res): "r"(in), "r"(idx));
+
+    res <<= idx;
+    res >>= idx;
+    return res;
+}
+ 
+ 
+int RSAZ_mod_exp_vpmadd52(
+ 	BN_ULONG *result,
+ 	const BN_ULONG *base_norm,
+ 	const BN_ULONG *exponent,
+ 	const BN_ULONG *m_norm,
+ 	const BN_ULONG *RR,
+ 	BN_ULONG k0,
+ 	int exponent_bits, int mod_bits, int const_time)
+ {
+     void (*select)(unsigned long *, const unsigned long *, int, int) = NULL;
+     void (*norm2red)(unsigned long *, const unsigned long *) = NULL;
+     void (*red2norm)(unsigned long *, const unsigned long *) = NULL;
+     void (*AMM)(uint64_t*, const uint64_t*, const uint64_t*, const uint64_t*, uint64_t k0) = NULL;
+      
+     unsigned long *m, *mod_mul_result, *temp, *R2, *a_tag, *base;
+     unsigned long const *conv_help;
+ 
+     int window_size;
+     int operand_words;
+ 
+     unsigned long *table_s = NULL;
+ 
+     int ret = 0;
+     int i;
+ 
+     unsigned char *space = NULL;
+ 
+     //__attribute__((aligned(4096))) unsigned char space[12160];
+ 
+     if((!const_time) && (exponent_bits <= 64))
+     {
+         window_size = 0;
+     }
+     else if(exponent_bits <= 5)
+     {
+         window_size = 1;
+     }
+     else if(exponent_bits <= 20)
+     {
+         window_size = 2;
+     }
+     else if(exponent_bits <= 512)
+     {
+         window_size = 4;
+     }
+     else
+     {
+         window_size = 5;
+     }
+ 
+     if(mod_bits == 1536)
+     {
+         select = rsaz_select1536_vpmadd;
+         norm2red = norm2red52_1536;
+         red2norm = red2norm52_1536;
+         AMM = AMM_1536_IFMA;
+ 
+         operand_words = 30;
+ 
+         conv_help = two96_red52;
+ 
+     }
+     else
+     {
+ 		goto bail;
+         /* This function should never be called with other moduli, should be checked externally */
+     }
+ 
+ 
+     space = (unsigned char *)memalign(4096, (1<<window_size)* operand_words*8 + operand_words*6*8);
+ 
+     if(NULL == space)
+     {
+         ret = 0;
+         goto bail;
+     }
+ 
+     m = (unsigned long *)&space[operand_words*8*0];
+     mod_mul_result = (unsigned long *)&space[operand_words*8*1];
+     temp = (unsigned long *)&space[operand_words*8*2];
+     a_tag = (unsigned long *)&space[operand_words*8*3];
+     R2 = (unsigned long *)&space[operand_words*8*4];
+     base = (unsigned long *)&space[operand_words*8*5];
+     table_s = (unsigned long *)&space[operand_words*8*6];
+ 
+     norm2red(m, m_norm);	
+     norm2red(base, base_norm);
+     norm2red(R2, RR);
+ 
+     AMM(R2, R2, R2, m, k0);
+     AMM(R2, R2, conv_help, m, k0);
+ 
+     if(!window_size)
+     {
+         uint64_t exp = exponent[0];
+         int pos = LZCNT(exp);
+ 
+         AMM(a_tag, R2, base, m, k0);
+         memcpy(mod_mul_result, a_tag, operand_words*8);
+ 
+         if(exp>1)
+         {
+             exp = BZHI(exp,63 - pos);
+             do
+             {
+                 if(exp)
+                 {
+                     for(i=0; i<LZCNT(exp) - pos; i++)
+                     {
+                         AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
+                     }
+                     AMM(mod_mul_result, mod_mul_result, a_tag, m, k0);
+                 }
+                 else
+                 {
+                     for(i=0; i<63 - pos; i++)
+                     {
+                         AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
+                     }
+                 }
+       
+                 pos = LZCNT(exp);
+                 exp = BZHI(exp,63 - pos);
+       
+             } while(exp || pos<63);
+         }
+         AMM(mod_mul_result, mod_mul_result, one, m, k0);
+         red2norm(result, mod_mul_result);
+     }
+     else
+     {
+         if(window_size == 1)
+         {
+             // table[0]
+            AMM(mod_mul_result, R2, one, m, k0);
+             memcpy(&table_s[0*operand_words], mod_mul_result, operand_words*8);
+             // table[1]
+             AMM(a_tag, R2, base, m, k0);
+             memcpy(&table_s[1*operand_words], a_tag, operand_words*8);
+         }
+         else
+         {
+             // table[0]
+             AMM(mod_mul_result, R2, one, m, k0);
+             memcpy(&table_s[0*operand_words], mod_mul_result, operand_words*8);
+             // table[1]
+             AMM(a_tag, R2, base, m, k0);
+             memcpy(&table_s[1*operand_words], a_tag, operand_words*8);
+             for(i=2; i<(1<<window_size); i++)
+             {
+                 AMM(mod_mul_result, a_tag, &table_s[(i-1)*operand_words], m, k0);
+                 memcpy(&table_s[i*operand_words], mod_mul_result, operand_words*8);
+             }
+         }
+         // load first window
+         unsigned char *p_str = (unsigned char*)exponent;
+         int index = exponent_bits - window_size;
+         int mask = (1<<window_size) - 1;
+         int wvalue = *((unsigned short*)&p_str[index/8]);
+         wvalue = (wvalue>> (index%8)) & mask;
+         index-=window_size;
+ 
+         select(mod_mul_result, table_s, wvalue, window_size);
+ 
+         while(index >= 0)   // loop for the remaining windows
+         {
+             for(i=0; i<window_size; i++)
+             {
+                 AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
+             }
+ 
+             wvalue = *((unsigned short*)&p_str[index/8]);
+             wvalue = (wvalue>> (index%8)) & mask;
+             index-=window_size;
+ 
+ 			select(temp, table_s, wvalue, window_size);
+             AMM(mod_mul_result, mod_mul_result, temp, m, k0);
+         }
+         if(index > -window_size) // The last window
+         {
+             int last_window_mask = (1<<(exponent_bits%window_size)) - 1;
+             for(i=0; i<window_size + index; i++)
+             {
+                 AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
+             }
+             wvalue = p_str[0] & last_window_mask;
+ 			select(temp, table_s, wvalue, window_size);
+             AMM(mod_mul_result, mod_mul_result, temp, m, k0);
+         }
+         AMM(mod_mul_result, mod_mul_result, one, m, k0);
+         red2norm(result, mod_mul_result);
+     }
+ 
+     ret = 1;
+ bail:
+     if(space)
+     {
+ 		memset(space, 0, (1<<window_size)* operand_words*8 + operand_words*6*8);
+         free(space);
+     }
+     return ret;
+ }
+
 #else
 
 # if defined(PEDANTIC) || defined(__DECC) || defined(__clang__)
diff -urN ../master_01_09_2015/patch ./patch
--- ../master_01_09_2015/patch	1970-01-01 02:00:00.000000000 +0200
+++ ./patch	2015-09-10 08:33:50.102944629 +0300
@@ -0,0 +1,690 @@
+Files ../master_01_09_2015/.git/index and ./.git/index differ
+diff -urN ../master_01_09_2015/Configure ./Configure
+--- ../master_01_09_2015/Configure	2015-09-01 11:25:05.962599731 +0300
++++ ./Configure	2015-09-07 11:43:54.702684155 +0300
+@@ -338,7 +338,7 @@
+     x86_64_asm => {
+ 	template	=> 1,
+ 	cpuid_obj       => "x86_64cpuid.o",
+-	bn_obj          => "x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o",
++	bn_obj          => "x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o rsaz-vpmadd.o",
+ 	ec_obj          => "ecp_nistz256.o ecp_nistz256-x86_64.o",
+ 	aes_obj         => "aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o",
+ 	md5_obj         => "md5-x86_64.o",
+diff -urN ../master_01_09_2015/Makefile ./Makefile
+--- ../master_01_09_2015/Makefile	2015-09-07 18:16:46.105739718 +0300
++++ ./Makefile	2015-09-07 11:43:57.419219498 +0300
+@@ -91,7 +91,7 @@
+ 
+ # CPUID module collects small commonly used assembler snippets
+ CPUID_OBJ= x86_64cpuid.o
+-BN_ASM= x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o
++BN_ASM= x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o rsaz-vpmadd.o
+ EC_ASM= ecp_nistz256.o ecp_nistz256-x86_64.o
+ DES_ENC= des_enc.o fcrypt_b.o
+ AES_ENC= aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o
+diff -urN ../master_01_09_2015/crypto/bn/Makefile ./crypto/bn/Makefile
+--- ../master_01_09_2015/crypto/bn/Makefile	2015-09-01 11:25:09.704858924 +0300
++++ ./crypto/bn/Makefile	2015-09-07 11:48:04.208658125 +0300
+@@ -109,6 +109,8 @@
+ 	$(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@
+ rsaz-avx2.s:	asm/rsaz-avx2.pl 
+ 	$(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@
++rsaz-vpmadd.s:	asm/rsaz-vpmadd.S
++	$(CC) $(CFLAGS) -E asm/rsaz-vpmadd.S > $@
+ 
+ bn-ia64.s:	asm/ia64.S
+ 	$(CC) $(CFLAGS) -E asm/ia64.S > $@
+diff -urN ../master_01_09_2015/crypto/bn/asm/rsaz-vpmadd.S ./crypto/bn/asm/rsaz-vpmadd.S
+--- ../master_01_09_2015/crypto/bn/asm/rsaz-vpmadd.S	1970-01-01 02:00:00.000000000 +0200
++++ ./crypto/bn/asm/rsaz-vpmadd.S	2015-09-10 08:32:05.049881491 +0300
+@@ -0,0 +1,342 @@
++##############################################################################
++# Copyright 2015 Intel Corporation                                           #
++#                                                                            #
++# Licensed under the Apache License, Version 2.0 (the "License");            #
++# you may not use this file except in compliance with the License.           #
++# You may obtain a copy of the License at                                    #
++#                                                                            #
++#    http://www.apache.org/licenses/LICENSE-2.0                              #
++#                                                                            #
++# Unless required by applicable law or agreed to in writing, software        #
++# distributed under the License is distributed on an "AS IS" BASIS,          #
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
++# See the License for the specific language governing permissions and        #
++# limitations under the License.                                             #
++#                                                                            #
++##############################################################################
++#                                                                            # 
++#  Developers and authors:                                                   # 
++#  Shay Gueron, and Nir Drucker                                              # 
++#  (1) Intel Corporation, Israel Development Center                          # 
++#  (2) University of Haifa                                                   # 
++#  Reference:                                                                # 
++#  S.Gueron and V.Krasnov, "Software Implementation of Modular Exponentiation#
++#  , Using Advanced Vector Instructions Architectures"                       #
++#  S. Gueron, V. Krasnov,                                                    #
++#  "New CPU instructions for speeding up modular exponentiation"             #
++#                                                                            #
++##############################################################################
++
++# Prototypes of functions in this document:
++# int rsaz_vpmadd52_eligible();
++# void AMM_1536_IFMA(uint64_t *rp, const uint64_t *aptr, const uint64_t *bptr, const uint64_t *nptr, uint64_t n0);
++# void rsaz_select1536_vpmadd(uint64_t *val, uint64_t *in_t, int index, int limit);
++
++.align 64
++.Land_mask:
++  .quad  0xfffffffffffff
++
++################################################################################
++.globl	rsaz_vpmadd52_eligible
++.type	rsaz_vpmadd52_eligible,@function
++.align	32
++rsaz_vpmadd52_eligible:
++	movl	OPENSSL_ia32cap_P+8(%rip),%eax
++	movl	%eax, %ecx
++	
++	shrl	$21, %eax	# vpmadd52 flag
++	shrl	$17, %ecx	# avx3.1 flag
++	and		%ecx, %eax
++	andl	$1,%eax
++	ret
++.size	rsaz_vpmadd52_eligible,.-rsaz_vpmadd52_eligible
++################################################################################
++
++
++.set TMP, %zmm0
++
++.set A0, %zmm1
++.set A1, %zmm2
++.set A2, %zmm3
++.set A3, %zmm4
++
++.set M0, %zmm9
++.set M1, %zmm10
++.set M2, %zmm11
++.set M3, %zmm12
++
++.set X0, %zmm17
++.set X1, %zmm18
++.set X2, %zmm19
++.set X3, %zmm20
++
++.set Y_curr, %zmm25
++.set Y_prev, %zmm26
++.set B_curr, %zmm27
++.set B_prev, %zmm28
++.set K, %zmm29
++.set ZERO, %zmm30
++
++.set itr, %r10
++
++.macro VMPADD52LUQ_K_MUL_X0_PLUS_Y_CURR
++  .byte 0x62, 0x22, 0x95, 0x40, 0xb4, 0xc9
++.endm
++
++.macro VMPADD52HUQ_A_MUL_B_PREV_PLUS_X
++  .byte 0x62, 0x82, 0xf5, 0x48, 0xb5, 0xcc
++  .byte 0x62, 0x82, 0xed, 0x48, 0xb5, 0xd4
++  .byte 0x62, 0x82, 0xe5, 0x48, 0xb5, 0xdc
++  .byte 0x62, 0x82, 0xdd, 0x48, 0xb5, 0xe4
++.endm
++
++.macro VMPADD52HUQ_M_MUL_Y_PREV_PLUS_X
++  .byte 0x62, 0x82, 0xb5, 0x48, 0xb5, 0xca
++  .byte 0x62, 0x82, 0xad, 0x48, 0xb5, 0xd2
++  .byte 0x62, 0x82, 0xa5, 0x48, 0xb5, 0xda
++  .byte 0x62, 0x82, 0x9d, 0x48, 0xb5, 0xe2
++.endm
++
++.macro VMPADD52LUQ_A0_MUL_B_CURR_PLUS_X0
++  .byte 0x62, 0x82, 0xf5, 0x48, 0xb4, 0xcb
++.endm
++
++.macro VMPADD52LUQ_A_MUL_B_CURR_PLUS_X
++  .byte 0x62, 0x82, 0xed, 0x48, 0xb4, 0xd3
++  .byte 0x62, 0x82, 0xe5, 0x48, 0xb4, 0xdb
++  .byte 0x62, 0x82, 0xdd, 0x48, 0xb4, 0xe3
++.endm
++
++.macro VMPADD52LUQ_M_MUL_Y_CURR_PLUS_X
++  .byte 0x62, 0x82, 0xb5, 0x48, 0xb4, 0xc9
++  .byte 0x62, 0x82, 0xad, 0x48, 0xb4, 0xd1
++  .byte 0x62, 0x82, 0xa5, 0x48, 0xb4, 0xd9
++  .byte 0x62, 0x82, 0x9d, 0x48, 0xb4, 0xe1
++.endm
++
++
++################################################################################
++# AMM in AVX3 + mpadd52hlq
++# void AMM_1536_IFMA(uint64_t* res, uint64_t* a64, uint64_t* b64, uint64_t* m64, uint64_t k0)
++# Almost Montgomery Multiplication with Montgomery Friendly modulus
++# optimized for 30 redundant (radix 2^52) word operands
++# 
++# resPtr - 30 qwords for words <= 30
++# aPtr - 30 qwords for words <= 30
++# bPtr - 30 qwords for words <= 30
++# mPtr - 30 qwords for words <= 30
++
++.set resPtr, %rdi
++.set aPtr,   %rsi
++.set bPtr,   %rdx
++.set mPtr,   %rcx
++.set k0,     %r8
++.set words,  %r9
++
++.type AMM_1536_IFMA,@function
++.globl AMM_1536_IFMA
++.hidden AMM_1536_IFMA
++.align 16
++AMM_1536_IFMA:
++
++  push %rbx
++  push %r10
++  push %r11
++  push %r12
++  push %r13
++  push %r14
++  sub $64, %rsp
++
++  #mask k1 selects 1 element, k2 selects 6
++  mov   $0x1,%rax
++  mov   $0x3f,%rbx
++  kmovw  %eax,%k1
++  kmovw  %ebx,%k2
++
++  vpxorq ZERO, ZERO, ZERO
++  vpbroadcastq  k0, K
++
++  vpxorq B_curr, B_curr, B_curr
++  vpxorq Y_curr, Y_curr, Y_curr
++
++  #Load a
++  vmovdqu64 64*0(aPtr), A0
++  vmovdqu64 64*1(aPtr), A1
++  vmovdqu64 64*2(aPtr), A2
++  #mov only 6 elements, zero the rest
++  vmovdqu64 64*3(aPtr), A3{%k2}{z}
++
++  #Load m
++  vmovdqu64 64*0(mPtr), M0
++  vmovdqu64 64*1(mPtr), M1
++  vmovdqu64 64*2(mPtr), M2
++  #mov only 6 elements, zero the rest
++  vmovdqu64 64*3(mPtr), M3{%k2}{z}
++
++  #X[i]=0
++  vpxorq X0, X0, X0
++  vpxorq X1, X1, X1
++  vpxorq X2, X2, X2
++  vpxorq X3, X3, X3
++
++  #counter for 30 digits 31 iterations
++  mov $31, itr
++
++.LAMM_1536_IFMA_loop:
++  vmovdqa64   Y_curr, Y_prev
++  vmovdqa64   B_curr, B_prev
++  vpbroadcastq    (bPtr), B_curr
++  lea     8(bPtr), bPtr
++
++  # Shift the X in zmms right by a word
++  vpsrlq  $52, X0, TMP
++  valignq $1, X0, X1, X0
++  vpaddq  TMP, X0, X0{%k1}
++
++  valignq $1, X1, X2, X1
++  valignq $1, X2, X3, X2
++  valignq $1, X3, ZERO, X3
++
++  # High multiplications
++  VMPADD52HUQ_A_MUL_B_PREV_PLUS_X
++
++  # Low multiplications
++  VMPADD52HUQ_M_MUL_Y_PREV_PLUS_X
++
++  dec  itr
++  je  .LAMM_1536_IFMA_loop_end
++
++  VMPADD52LUQ_A0_MUL_B_CURR_PLUS_X0
++  vpxorq      Y_curr, Y_curr, Y_curr
++  VMPADD52LUQ_K_MUL_X0_PLUS_Y_CURR
++
++  vpermq      Y_curr, ZERO, Y_curr
++
++  VMPADD52LUQ_A_MUL_B_CURR_PLUS_X
++  VMPADD52LUQ_M_MUL_Y_CURR_PLUS_X
++
++  jmp .LAMM_1536_IFMA_loop
++
++.LAMM_1536_IFMA_loop_end:
++
++  vmovdqu64 X0, 64*0(resPtr)
++  vmovdqu64 X1, 64*1(resPtr)
++  vmovdqu64 X2, 64*2(resPtr)
++
++  vmovdqu64 X3, (%rsp)
++
++  movq 0*8(%rsp), %r11
++  movq %r11, 64*3+0*8(resPtr)
++  movq 1*8(%rsp), %r11
++  movq %r11, 64*3+1*8(resPtr)
++  movq 2*8(%rsp), %r11
++  movq %r11, 64*3+2*8(resPtr)
++  movq 3*8(%rsp), %r11
++  movq %r11, 64*3+3*8(resPtr)
++  movq 4*8(%rsp), %r11
++  movq %r11, 64*3+4*8(resPtr)
++  movq 5*8(%rsp), %r11
++  movq %r11, 64*3+5*8(resPtr)
++
++.Lend:
++  mov  $30, itr
++  mov  $52, %r11
++  xor  %r14, %r14
++  mov  .Land_mask(%rip), %r12
++
++.LFixLoop:
++  adc  (resPtr), %r14
++  mov  %r14, %r13
++  and %r12, %r13
++  mov  %r13, (resPtr)
++  shrx %r11, %r14, %r14
++  lea  8(resPtr), resPtr
++  dec  itr
++  jnz  .LFixLoop
++
++  add $64, %rsp
++
++  pop %r14
++  pop %r13
++  pop %r12
++  pop %r11
++  pop %r10
++  pop %rbx
++
++  ret
++.size AMM_1536_IFMA, .-AMM_1536_IFMA
++
++################################################################################
++# Read the value with index 'idx' from the table 'tbl' in constant time.
++# The values are 1536 bit redundant representation, that means each value is 30
++# qwords. The values reside sequentially in the table.
++# The number of values in the table is 1<<w_size (2^w_size).
++# The assumed legal values are (not checked):
++# 0 < w_size < 7
++# 0 <= idx < 2^w_size
++# NOTE(review): the 64*3(tbl) load spans 32 qwords per 30-qword entry, i.e. 16 bytes past the last entry -- confirm the table allocation leaves slack.
++# void rsaz_select1536_vpmadd(uint64_t* res, uint64_t *tbl, int idx, int w_size);
++
++.type    rsaz_select1536_vpmadd,@function
++.globl   rsaz_select1536_vpmadd
++.hidden  rsaz_select1536_vpmadd
++
++rsaz_select1536_vpmadd:
++
++.set res, %rdi                          # SysV arg0: output buffer, 30 qwords
++.set tbl, %rsi                          # SysV arg1: table base
++.set idx, %rdx                          # SysV arg2: wanted entry index
++.set w, %rcx                            # SysV arg3: log2 of table size
++
++.set RES0, %zmm0                        # accumulates the selected entry
++.set RES1, %zmm1
++.set RES2, %zmm2
++.set RES3, %zmm3                        # only 6 of these 8 qwords are meaningful
++
++.set ONE, %zmm8                         # broadcast constant 1
++.set IDX, %zmm9                         # idx broadcast to all lanes
++.set CUR, %zmm10                        # loop counter broadcast to all lanes
++
++  sub $64, %rsp                         # scratch slot for spilling RES3
++  
++  vpbroadcastq  idx, IDX  
++  mov  $1, %rax
++  vpbroadcastq  %rax, ONE
++  shl  %cl, %rax                        # rax = 2^w_size = number of entries to scan
++  
++  vpxorq CUR, CUR, CUR                  # CUR = 0
++  
++1:
++  vpcmpeqq   IDX, CUR, %k2              # k2 = all-ones iff CUR == idx, else all-zeros
++  vmovdqu64  64*0(tbl), RES0{%k2}       # merge-masked load: copies only on match,
++  vmovdqu64  64*1(tbl), RES1{%k2}       # but every entry is read on every pass so
++  vmovdqu64  64*2(tbl), RES2{%k2}       # the memory access pattern is idx-independent
++  vmovdqu64  64*3(tbl), RES3{%k2}
++
++  vpaddq    ONE, CUR, CUR               # CUR += 1
++  lea      30*8(tbl), tbl               # advance to the next 30-qword entry
++  dec  %rax
++  jnz  1b
++
++  vmovdqu64  RES0, 64*0(res)            # qwords 0..23 stored directly
++  vmovdqu64  RES1, 64*1(res)
++  vmovdqu64  RES2, 64*2(res)
++  vmovdqu64  RES3, (%rsp)               # spill; only 6 of 8 qwords are valid
++
++  movq 0*8(%rsp), %r11                  # copy qwords 24..29 one at a time so only
++  movq %r11, 64*3+0*8(res)              # 30 qwords are written to res
++  movq 1*8(%rsp), %r11
++  movq %r11, 64*3+1*8(res)
++  movq 2*8(%rsp), %r11
++  movq %r11, 64*3+2*8(res)
++  movq 3*8(%rsp), %r11
++  movq %r11, 64*3+3*8(res)
++  movq 4*8(%rsp), %r11
++  movq %r11, 64*3+4*8(res)
++  movq 5*8(%rsp), %r11
++  movq %r11, 64*3+5*8(res)
++
++  add $64, %rsp
++ret
++
++.size rsaz_select1536_vpmadd, .-rsaz_select1536_vpmadd
+diff -urN ../master_01_09_2015/crypto/bn/bn_exp.c ./crypto/bn/bn_exp.c
+--- ../master_01_09_2015/crypto/bn/bn_exp.c	2015-09-01 11:25:10.102810656 +0300
++++ ./crypto/bn/bn_exp.c	2015-09-07 11:25:35.181778979 +0300
+@@ -696,7 +696,17 @@
+      * RSAZ exponentiation. For further information see
+      * crypto/bn/rsaz_exp.c and accompanying assembly modules.
+      */
+-    if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
++    if ((24 == a->top) && (BN_num_bits(m) == 1536) && rsaz_vpmadd52_eligible())
++ 	  {
++      if (NULL == bn_wexpand(rr, 24)) goto err;
++      RSAZ_mod_exp_vpmadd52(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], BN_num_bits(p), 1536, 1);
++      rr->top = 24;
++      rr->neg = 0;
++      bn_correct_top(rr);
++      ret = 1;
++      goto err;
++ 		}
++   	else if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
+         && rsaz_avx2_eligible()) {
+         if (NULL == bn_wexpand(rr, 16))
+             goto err;
+diff -urN ../master_01_09_2015/crypto/bn/rsaz_exp.c ./crypto/bn/rsaz_exp.c
+--- ../master_01_09_2015/crypto/bn/rsaz_exp.c	2015-09-01 11:25:10.273899904 +0300
++++ ./crypto/bn/rsaz_exp.c	2015-09-10 08:31:45.108977339 +0300
+@@ -35,7 +35,7 @@
+ *                                                                            *
+ ******************************************************************************
+ * Developers and authors:                                                    *
+-* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
++* Shay Gueron (1, 2), Vlad Krasnov (1) and Nir Drucker(1,2)                  *
+ * (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
+ * (2) University of Haifa, Israel                                            *
+ *****************************************************************************/
+@@ -337,6 +337,272 @@
+     OPENSSL_cleanse(storage, sizeof(storage));
+ }
+ 
++ALIGN64 static const BN_ULONG two96_red52[30] =  /* 2^96 in radix-2^52: digit 1 = 2^44, value = 2^44 * 2^52 */
++ 	{0,1UL<<44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
++
++void AMM_1536_IFMA(uint64_t res[40], const uint64_t a[40], const uint64_t b[40], const uint64_t m[40], uint64_t k0);
++void rsaz_select1536_vpmadd(unsigned long out[40], const unsigned long *tbl, int idx, int w_size);
++
++#define IFMA_MASK ((uint64_t)0xFFFFFFFFFFFFF) /* 52-bit digit mask; no trailing ';' so the macro is safe mid-expression */
++
++/* Convert 1536 bit value in, into the redundant representation radix 2^52
++   the result is 30 redundant words (each < 2^52; the last < 2^28) */
++static void norm2red52_1536(BN_ULONG out[30], const BN_ULONG in[24])
++{
++  int i, j, off;
++
++  /* word-level extraction avoids the old byte-pointer code's unaligned loads and its 4-byte read past in[23] */
++  for (i = 0; i < 30; i++)
++  {
++    j = (52 * i) >> 6; off = (52 * i) & 63;  /* digit i starts at bit 52*i */
++    out[i] = in[j] >> off;
++    if (off > 12 && j + 1 < 24)              /* digit straddles a word boundary */
++      out[i] |= in[j + 1] << (64 - off);
++    out[i] &= IFMA_MASK;
++  }
++  out[29] &=  0xfffffff;                     /* top digit holds only 1536 - 29*52 = 28 bits */
++}
++
++/* Convert 1536 bit value in from redundant representation (30 digits of
++   radix 2^52) into the regular representation; the result is 24 qwords */
++static void red2norm52_1536(BN_ULONG out[24], const BN_ULONG in[30])
++{
++  int i, j, off;
++  /* word-level reassembly avoids the old byte-pointer code's unaligned stores and its 4-byte write past out[23] */
++  for (i = 0; i < 24; i++) out[i] = 0;
++  for (i = 0; i < 30; i++)
++  {
++    j = (52 * i) >> 6; off = (52 * i) & 63;  /* digit i lands at bit 52*i */
++    out[j] |= in[i] << off;
++    if (off > 12 && j + 1 < 24)              /* 52+off > 64: spill the high part */
++      out[j + 1] |= in[i] >> (64 - off);
++  }
++}
++
++static unsigned long LZCNT(unsigned long in)  /* count of leading zero bits; LZCNT(0) == 64 */
++{
++    unsigned long res;
++    asm("lzcnt %1, %0\n\t" : "=r"(res): "r"(in));  /* requires LZCNT (ABM) CPU support */
++ 
++    return res;
++}
++ 
++static unsigned long BZHI(unsigned long in, unsigned long idx)
++{
++    /* Keep the low 'idx' bits of 'in' (true BZHI semantics): the old shift
++       pair shifted by 'idx' and so kept 64-idx bits, never clearing the
++       leading exponent bit, which made the caller's do/while loop forever. */
++    unsigned long res = in;
++
++   //asm("bzhi %2, %1, %0\n\t" : "=r"(res): "r"(in), "r"(idx));
++    if (idx < 64)                      /* a shift by 64 would be undefined */
++        res = idx ? (res << (64 - idx)) >> (64 - idx) : 0;
++    return res;
++}
++ 
++ 
++int RSAZ_mod_exp_vpmadd52(           /* result = base^exponent mod m via radix-2^52 Montgomery (AMM) */
++ 	BN_ULONG *result,                /* out: regular representation, mod_bits/64 qwords */
++ 	const BN_ULONG *base_norm,       /* base, regular representation */
++ 	const BN_ULONG *exponent,        /* exponent, regular representation */
++ 	const BN_ULONG *m_norm,          /* modulus, regular representation */
++ 	const BN_ULONG *RR,              /* Montgomery R^2 mod m, R = 2^mod_bits */
++ 	BN_ULONG k0,                     /* n0 = -m^-1 mod 2^64 */
++ 	int exponent_bits, int mod_bits, int const_time)
++ {
++     void (*select)(unsigned long *, const unsigned long *, int, int) = NULL;  /* constant-time table read */
++     void (*norm2red)(unsigned long *, const unsigned long *) = NULL;          /* to radix-2^52 form */
++     void (*red2norm)(unsigned long *, const unsigned long *) = NULL;          /* back to radix-2^64 form */
++     void (*AMM)(uint64_t*, const uint64_t*, const uint64_t*, const uint64_t*, uint64_t k0) = NULL;  /* AMM(x,y) = x*y*2^-1560 mod m */
++
++     unsigned long *m, *mod_mul_result, *temp, *R2, *a_tag, *base;
++     unsigned long const *conv_help;  /* constant fixing up RR for the wider redundant R */
++
++     int window_size;
++     int operand_words;               /* digits per operand in redundant form (30 for 1536) */
++
++     unsigned long *table_s = NULL;   /* 2^window_size precomputed powers of the base */
++
++     int ret = 0;
++     int i;
++
++     unsigned char *space = NULL;
++
++     //__attribute__((aligned(4096))) unsigned char space[12160];
++
++     if((!const_time) && (exponent_bits <= 64))  /* short public exponent: square-and-multiply */
++     {
++         window_size = 0;
++     }
++     else if(exponent_bits <= 5)      /* windowed paths; reached when const_time or bits > 64 */
++     {
++         window_size = 1;
++     }
++     else if(exponent_bits <= 20)
++     {
++         window_size = 2;
++     }
++     else if(exponent_bits <= 512)
++     {
++         window_size = 4;
++     }
++     else
++     {
++         window_size = 5;
++     }
++
++     if(mod_bits == 1536)             /* only the 1536-bit kernels are wired up */
++     {
++         select = rsaz_select1536_vpmadd;
++         norm2red = norm2red52_1536;
++         red2norm = red2norm52_1536;
++         AMM = AMM_1536_IFMA;
++
++         operand_words = 30;
++
++         conv_help = two96_red52;     /* 2^96 in redundant form, used below */
++
++     }
++     else
++     {
++ 		goto bail;
++         /* This function should never be called with other moduli, should be checked externally */
++     }
++
++
++     space = (unsigned char *)memalign(4096, (1<<window_size)* operand_words*8 + operand_words*6*8);  /* NOTE(review): memalign() is obsolete; posix_memalign is preferred */
++
++     if(NULL == space)
++     {
++         ret = 0;
++         goto bail;
++     }
++
++     m = (unsigned long *)&space[operand_words*8*0];  /* scratch layout: six temporaries, then the window table */
++     mod_mul_result = (unsigned long *)&space[operand_words*8*1];
++     temp = (unsigned long *)&space[operand_words*8*2];
++     a_tag = (unsigned long *)&space[operand_words*8*3];
++     R2 = (unsigned long *)&space[operand_words*8*4];
++     base = (unsigned long *)&space[operand_words*8*5];
++     table_s = (unsigned long *)&space[operand_words*8*6];
++
++     norm2red(m, m_norm);
++     norm2red(base, base_norm);
++     norm2red(R2, RR);
++
++     AMM(R2, R2, R2, m, k0);          /* R2 = RR^2 * 2^-1560 */
++     AMM(R2, R2, conv_help, m, k0);   /* * 2^96 * 2^-1560: R2 is now (2^1560)^2 mod m */
++
++     if(!window_size)                 /* left-to-right square-and-multiply, exponent fits one word */
++     {
++         uint64_t exp = exponent[0];
++         int pos = LZCNT(exp);        /* leading set bit of exp is bit 63-pos */
++
++         AMM(a_tag, R2, base, m, k0); /* a_tag = base in Montgomery form */
++         memcpy(mod_mul_result, a_tag, operand_words*8);
++
++         if(exp>1)
++         {
++             exp = BZHI(exp,63 - pos);   /* strip the leading bit consumed above */
++             do
++             {
++                 if(exp)
++                 {
++                     for(i=0; i<LZCNT(exp) - pos; i++)  /* square once per bit down to the next set bit */
++                     {
++                         AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
++                     }
++                     AMM(mod_mul_result, mod_mul_result, a_tag, m, k0);
++                 }
++                 else
++                 {
++                     for(i=0; i<63 - pos; i++)          /* remaining bits all zero: squarings only */
++                     {
++                         AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
++                     }
++                 }
++
++                 pos = LZCNT(exp);
++                 exp = BZHI(exp,63 - pos);              /* relies on BZHI keeping the low 63-pos bits */
++
++             } while(exp || pos<63);
++         }
++         AMM(mod_mul_result, mod_mul_result, one, m, k0);  /* by 1: leave the Montgomery domain ('one' is presumably the redundant-form 1, defined elsewhere in this file) */
++         red2norm(result, mod_mul_result);
++     }
++     else
++     {
++         if(window_size == 1)
++         {
++             // table[0] = Montgomery 1
++            AMM(mod_mul_result, R2, one, m, k0);
++             memcpy(&table_s[0*operand_words], mod_mul_result, operand_words*8);
++             // table[1] = Montgomery base
++             AMM(a_tag, R2, base, m, k0);
++             memcpy(&table_s[1*operand_words], a_tag, operand_words*8);
++         }
++         else
++         {
++             // table[0] = Montgomery 1
++             AMM(mod_mul_result, R2, one, m, k0);
++             memcpy(&table_s[0*operand_words], mod_mul_result, operand_words*8);
++             // table[1] = Montgomery base
++             AMM(a_tag, R2, base, m, k0);
++             memcpy(&table_s[1*operand_words], a_tag, operand_words*8);
++             for(i=2; i<(1<<window_size); i++)          /* table[i] = table[i-1] * base */
++             {
++                 AMM(mod_mul_result, a_tag, &table_s[(i-1)*operand_words], m, k0);
++                 memcpy(&table_s[i*operand_words], mod_mul_result, operand_words*8);
++             }
++         }
++         // load first window
++         unsigned char *p_str = (unsigned char*)exponent;
++         int index = exponent_bits - window_size;       /* bit offset of the top window */
++         int mask = (1<<window_size) - 1;
++         int wvalue = *((unsigned short*)&p_str[index/8]);  /* NOTE(review): 16-bit read may touch one byte past the exponent buffer -- confirm */
++         wvalue = (wvalue>> (index%8)) & mask;
++         index-=window_size;
++
++         select(mod_mul_result, table_s, wvalue, window_size);  /* constant-time table lookup */
++
++         while(index >= 0)   // loop for the remaining full windows
++         {
++             for(i=0; i<window_size; i++)               /* shift accumulator left by one window */
++             {
++                 AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
++             }
++
++             wvalue = *((unsigned short*)&p_str[index/8]);
++             wvalue = (wvalue>> (index%8)) & mask;
++             index-=window_size;
++
++ 			select(temp, table_s, wvalue, window_size);
++             AMM(mod_mul_result, mod_mul_result, temp, m, k0);
++         }
++         if(index > -window_size) // The last window: exponent_bits % window_size leftover bits
++         {
++             int last_window_mask = (1<<(exponent_bits%window_size)) - 1;
++             for(i=0; i<window_size + index; i++)
++             {
++                 AMM(mod_mul_result, mod_mul_result, mod_mul_result, m, k0);
++             }
++             wvalue = p_str[0] & last_window_mask;
++ 			select(temp, table_s, wvalue, window_size);
++             AMM(mod_mul_result, mod_mul_result, temp, m, k0);
++         }
++         AMM(mod_mul_result, mod_mul_result, one, m, k0);  /* leave the Montgomery domain */
++         red2norm(result, mod_mul_result);
++     }
++
++     ret = 1;
++ bail:
++     if(space)
++     {
++ 		memset(space, 0, (1<<window_size)* operand_words*8 + operand_words*6*8);  /* scrub secrets before freeing */
++         free(space);
++     }
++     return ret;
++ }
++
+ #else
+ 
+ # if defined(PEDANTIC) || defined(__DECC) || defined(__clang__)
