Hi Andy!

I hope I have addressed all your wishes regarding the PadLock engine.
Attached is the interdiff since the last version; the full patch is
available on my PadLock page (http://www.logix.cz/michal/devel/padlock).

Most recent changes are:
- Rewritten the aligner to use only a fixed-size buffer on the stack
  instead of malloc()ed space.
- Control word, IV and KEY are now in ctx->cipher_data
- Extended key is computed in place.
- Using padlock_bswapl() instead of htonl().

Do you now like it more? :-)

Michal Ludvig
-- 
* A mouse is a device used to point at the xterm you want to type in.
* Personal homepage - http://www.logix.cz/michal
Index: crypto/engine/eng_padlock.c
===================================================================
--- crypto/engine/eng_padlock.c.orig
+++ crypto/engine/eng_padlock.c
@@ -65,8 +65,6 @@
 #include <string.h>
 #include <inttypes.h>
 
-#include <netinet/in.h>        /* we need htonl() */
-
 #include <openssl/crypto.h>
 #include <openssl/dso.h>
 #include <openssl/engine.h>
@@ -323,14 +321,10 @@
                                      sizeof(padlock_cipher_nids[0]));
 
 /* Function prototypes ... */
-static int padlock_aes_init_key_128(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                                   const unsigned char *iv, int enc);
-static int padlock_aes_init_key_192(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                                   const unsigned char *iv, int enc);
-static int padlock_aes_init_key_256(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                                   const unsigned char *iv, int enc);
+static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc);
 static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
-                             const unsigned char *in, unsigned int inl);
+                             const unsigned char *in, unsigned int nbytes);
 
 /* Some AES-related constants */
 #define AES_BLOCK_SIZE         16
@@ -341,14 +335,31 @@
 #define AES_KEY_WORDS          (4 * (AES_MAXNR + 1))
 #define        AES_KEY_BYTES           (AES_KEY_WORDS * 4)
 
+/* Control word. */
+union cword {
+       uint32_t cword[4];
+       struct {
+               int rounds:4;
+               int algo:3;
+               int keygen:1;
+               int interm:1;
+               int encdec:1;
+               int ksize:2;
+       } b;
+};
+
 /* Here we store the plain key for AES128
    and the extended key for AES192/AES256 */
-struct padlock_aes_key
+struct padlock_cipher_data
 {
-       uint32_t aes_key[AES_KEY_WORDS];
-       int extended;
+       uint8_t iv[AES_BLOCK_SIZE];     /* Initialization vector */
+       union cword cword;              /* Control word */
+       AES_KEY ks;                     /* Encryption key */
+       uint8_t *key;                   /* Encryption key pointer */
 };
 
+#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)(ctx->cipher_data + 
((0x10 - ((size_t)(ctx->cipher_data) & 0x0F)) & 0x0F)))
+
 /* Declaring so many ciphers by hand would be a pain.
    Instead introduce a bit of preprocessor magic :-) */
 #define        DECLARE_AES_EVP(ksize,lmode,umode)      \
@@ -358,10 +369,10 @@
        AES_KEY_SIZE_##ksize,           \
        AES_BLOCK_SIZE,                 \
        0 | EVP_CIPH_##umode##_MODE,    \
-       padlock_aes_init_key_##ksize,   \
+       padlock_aes_init_key,           \
        padlock_aes_cipher,             \
        NULL,                           \
-       sizeof(struct padlock_aes_key), \
+       sizeof(struct padlock_cipher_data) + 16,        \
        EVP_CIPHER_set_asn1_iv,         \
        EVP_CIPHER_get_asn1_iv,         \
        NULL,                           \
@@ -442,100 +453,72 @@
        return 1;
 }
 
-/* Generate an extended AES key in software. Needed for AES192/AES256 */
-static int
-padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                     const unsigned char *iv, int enc,
-                     int key_len)
+/* Our own htonl()/ntohl() */
+static inline void
+padlock_bswapl(uint32_t *arg)
 {
-       AES_KEY ks;
-       struct padlock_aes_key *tmp_aes_key = NULL;
-       int i;
-
-       tmp_aes_key = (struct padlock_aes_key *) (ctx->cipher_data);
-       memset(tmp_aes_key, 0, sizeof(struct padlock_aes_key));
-       if (key) {
-               if (enc)
-                       AES_set_encrypt_key(key, key_len, &ks);
-               else
-                       AES_set_decrypt_key(key, key_len, &ks);
-
-               /* OpenSSL internal functions use byte-swapped extended key. */
-               for (i = 0; i < AES_KEY_WORDS; i++)
-                       tmp_aes_key->aes_key[i] = htonl(ks.rd_key[i]);
-
-               tmp_aes_key->extended = 1;
-       }
-
-       return 1;
+       asm volatile ("bswapl %0" : "+r"(*arg));
 }
 
-/* PadLock can generate an extended key for AES128 in hardware */
+/* Prepare the encryption key for PadLock usage */
 static int
-padlock_aes_init_key_128(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                         const unsigned char *iv, int enc)
+padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                     const unsigned char *iv, int enc)
 {
-       struct padlock_aes_key *tmp_aes_key = NULL;
+       struct padlock_cipher_data *cdata;
+       int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;
+       int i;
 
-       tmp_aes_key = (struct padlock_aes_key *) (ctx->cipher_data);
-       memset(tmp_aes_key, 0, sizeof(struct padlock_aes_key));
+       cdata = ALIGNED_CIPHER_DATA(ctx);
+       memset(cdata, 0, sizeof(struct padlock_cipher_data));
+       
+       /* Prepare Control word. */
+       cdata->cword.b.encdec = (ctx->encrypt == 0);
+       cdata->cword.b.rounds = 10 + (key_len - 128) / 32;
+       cdata->cword.b.ksize = (key_len - 128) / 64;
+
+       cdata->key = (uint8_t *)(cdata->ks.rd_key);
+       
        if (key) {
-               memcpy (tmp_aes_key->aes_key, key, AES_KEY_SIZE_128);
-               tmp_aes_key->extended = 0;
+               switch(key_len) {
+                       case 128:
+                               /* PadLock can generate an extended key for
+                                  AES128 in hardware */
+                               memcpy (cdata->key, key, AES_KEY_SIZE_128);
+                               cdata->cword.b.keygen = 0;
+                               break;
+
+                       case 192:
+                       case 256:
+                               /* Generate an extended AES key in software.
+                                  Needed for AES192/AES256 */
+                               if (enc)
+                                       AES_set_encrypt_key(key, key_len, &cdata->ks);
+                               else
+                                       AES_set_decrypt_key(key, key_len, &cdata->ks);
+
+                               /* OpenSSL internal functions use byte-swapped 
extended key. */
+                               for (i = 0; i < AES_KEY_WORDS; i++)
+                                       padlock_bswapl((uint32_t 
*)&(cdata->ks.rd_key[i]));
+
+                               cdata->cword.b.keygen = 1;
+                               break;
+
+                       default:
+                               /* ERROR */
+                               return 0;
+               }
        }
 
        return 1;
 }
 
-static int
-padlock_aes_init_key_192(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                         const unsigned char *iv, int enc)
-{
-       return padlock_aes_init_key(ctx, key, iv, enc, 192);
-}
-
-static int
-padlock_aes_init_key_256(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-                         const unsigned char *iv, int enc)
-{
-       return padlock_aes_init_key(ctx, key, iv, enc, 256);
-}
-
-/* Data for the VIA PadLock *must* be aligned on 16-Bytes boundaries! */
-static void *
-padlock_aligned_malloc(size_t size, size_t alignment, void **index)
-{
-       char *ptr;
-
-       ptr = malloc(size + alignment);
-       *index = ptr;
-       if (alignment > 1 && ((long)ptr & (alignment - 1))) {
-               ptr += alignment - ((long)ptr & (alignment - 1));
-       }
-
-       return ptr;
-}
-
-/* Control word. */
-union cword {
-       uint32_t cword[4];
-       struct {
-               int rounds:4;
-               int algo:3;
-               int keygen:1;
-               int interm:1;
-               int encdec:1;
-               int ksize:2;
-       } b;
-};
-
 /* Template for all modes */
 #define PADLOCK_XCRYPT_ASM(name,opcode) \
 static inline void name(uint8_t *input, uint8_t *output, uint8_t *key, \
                        uint8_t **iv, void *control_word, uint32_t count)       \
 {                                                                              \
-       asm volatile ("pushfl; popfl\n"                                         \
-                     "pushl %%ebx\n"                                   \
+       asm volatile ("pushl %%ebx\n"                                   \
                      "movl %%eax, %%ebx\n"                                     \
                      "movl %0, %%eax\n"                                        \
                      "movl (%%eax), %%eax\n"                                   \
@@ -555,126 +538,112 @@
 
 /* Re-align the arguments to 16-Bytes boundaries and run the 
    encryption function itself. This function is not AES-specific. */
-static inline void
-padlock_aligner(uint8_t *out_arg, const uint8_t *in_arg, uint8_t *iv_arg,
-               void *key, union cword *cword, size_t nbytes,
-               size_t blocksize, int mode)
-{
-       /* Don't blindly modify this structure - the items must 
-          be 16-Bytes aligned! */
-       struct padlock_xcrypt_data {
-               uint8_t iv[2*blocksize];                /* Initialization vector */
+static int
+padlock_aes_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out_arg, const uint8_t *in_arg, 
+                  unsigned int nbytes)
+{
+       struct padlock_realign_buffer {
+               #define REALIGN_SIZE    4096    /* Must be a multiple of 16! */
+               uint8_t buf[REALIGN_SIZE];      /* Buffer for data realignmentation */
        };
 
+       struct padlock_cipher_data *cdata = ALIGNED_CIPHER_DATA(ctx);
        uint8_t *in, *out, *iv;
-       void *index = NULL;
-       char bigbuf[sizeof(struct padlock_xcrypt_data) + 16];
-       struct padlock_xcrypt_data *data;
+       char bigbuf[sizeof(struct padlock_realign_buffer) + 16];
+       struct padlock_realign_buffer *realign;
+       int in_arg_aligned, out_arg_aligned, realign_in_loop = 0;
+       size_t realigned_bytes, blocksize = AES_BLOCK_SIZE;
        
-       memset(bigbuf, 0, sizeof (bigbuf));
+       /* Force key reload.
+          TODO: ... only if the context is different from the last one. */
+       asm volatile ("pushf; popf");
 
-       /* Place 'data' at the first 16-Bytes aligned address in 'bigbuf' */
-       if (((long)bigbuf) & 0x0F)
-               data = (void*)(bigbuf + 16 - ((long)bigbuf & 0x0F));
-       else
-               data = (void*)bigbuf;
+       /* Place 'realign' at the first 16-Bytes aligned address in 'bigbuf' */
+       realign = (void*)(bigbuf + ((16 - ((size_t)bigbuf & 0x0F)) & 0x0F));
 
        /* Always make a local copy of IV - xcrypt may change it! */
-       iv = data->iv;
-       if (iv_arg)
-               memcpy(iv, iv_arg, blocksize);
-
-       /* Align 'in_arg' */
-       if (((long)in_arg) & 0x0F) {
-               in = padlock_aligned_malloc(nbytes, 16, &index);
-               memcpy(in, in_arg, nbytes);
-       }
-       else
-               in = (uint8_t*)in_arg;
+       iv = cdata->iv;
+       if (ctx->iv)
+               memcpy(iv, ctx->iv, blocksize);
+
+       /* Set in/out buffers depending on their alignment. */
+       in_arg_aligned = !(((size_t)in_arg) & 0x0F);
+       out_arg_aligned = !(((size_t)out_arg) & 0x0F);
 
-       /* Align 'out_arg' */
-       if (((long)out_arg) & 0x0F) {
-               if (index)
-                       out = in;       /* xcrypt can work "in place" */
+       if (in_arg_aligned) {
+               in = (uint8_t*)in_arg;
+               if (out_arg_aligned)
+                       out = out_arg;
                else
-                       out = padlock_aligned_malloc(nbytes, 16, &index);
-       }
-       else
-               out = out_arg;
-
-       /* Run xcrypt for a requested mode */
-       switch (mode) {
-               case EVP_CIPH_ECB_MODE:
-                       padlock_xcrypt_ecb(in, out, key, &iv, cword, nbytes/blocksize);
-                       break;
-
-               case EVP_CIPH_CBC_MODE:
-                       padlock_xcrypt_cbc(in, out, key, &iv, cword, nbytes/blocksize);
-                       break;
-
-               case EVP_CIPH_CFB_MODE:
-                       padlock_xcrypt_cfb(in, out, key, &iv, cword, nbytes/blocksize);
-                       break;
-
-               case EVP_CIPH_OFB_MODE:
-                       padlock_xcrypt_ofb(in, out, key, &iv, cword, nbytes/blocksize);
-                       break;
-
-               default:
-                       break;
-       }
-
-       /* Copy the 16-Byte aligned output to the caller's buffer. */
-       if (out != out_arg)
-               memcpy(out_arg, out, nbytes);
+                       out = in;
+       } else {
+               if (out_arg_aligned) {
+                       memcpy(out_arg, in_arg, nbytes);
+                       in = out_arg;   /* xcrypt "in place" */
+                       out = in;
+               } else {
+                       /* Tough but most common case - nothing is 
+                          aligned. We will re-align the input data 
+                          by parts in a loop. Each part should be 
+                          small enough to fit in the L1 D-cache 
+                          (hope that it helps a bit :-) */
+                       realign_in_loop = 1;
+                       realigned_bytes = 0;
+                       in = realign->buf;
+                       out = realign->buf;
+               }
+       }
+
+       /* Main encryption loop */
+       do {
+               size_t current_nbytes;
+               /* Realign the data if needed */
+               if (realign_in_loop) {
+                       current_nbytes = nbytes < REALIGN_SIZE ? nbytes : REALIGN_SIZE;
+                       memcpy(in, in_arg + realigned_bytes, current_nbytes);
+               } else
+                       current_nbytes = nbytes;
+
+               /* Run xcrypt for a requested mode */
+               switch (EVP_CIPHER_CTX_mode(ctx)) {
+                       case EVP_CIPH_ECB_MODE:
+                               padlock_xcrypt_ecb(in, out, cdata->key, &iv, 
&cdata->cword, current_nbytes/blocksize);
+                               break;
+
+                       case EVP_CIPH_CBC_MODE:
+                               padlock_xcrypt_cbc(in, out, cdata->key, &iv, 
&cdata->cword, current_nbytes/blocksize);
+                               break;
+
+                       case EVP_CIPH_CFB_MODE:
+                               padlock_xcrypt_cfb(in, out, cdata->key, &iv, 
&cdata->cword, current_nbytes/blocksize);
+                               break;
+
+                       case EVP_CIPH_OFB_MODE:
+                               padlock_xcrypt_ofb(in, out, cdata->key, &iv, 
&cdata->cword, current_nbytes/blocksize);
+                               break;
+
+                       default:
+                               break;
+               }
+
+               /* Save the result */
+               if (realign_in_loop) {
+                       memcpy(out_arg + realigned_bytes, out, current_nbytes);
+                       nbytes -= current_nbytes;
+                       realigned_bytes += current_nbytes;
+               } else {
+                       /* Copy the 16-Byte aligned output to the caller's buffer. */
+                       if (out != out_arg)
+                               memcpy(out_arg, out, nbytes);
+               }
+       } while (realign_in_loop && nbytes);
+
+       /* Clean the realign buffer if it was used */
+       if (realign_in_loop)
+               memset(in, 0, REALIGN_SIZE);
 
        /* Save modified IV */
-       memcpy (iv_arg, iv, blocksize);
-
-       if (index)
-               free(index);
-
-}
-
-/* Entry point specific for AES cipher */
-static int
-padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
-                  const unsigned char *in, unsigned int in_len)
-{
-       char bigbuf[sizeof(union cword) + 16];
-       union cword *cword;
-       void *key_arg, *key, *iv, *index_key=NULL;
-       int mode = EVP_CIPHER_CTX_mode(ctx);
-       int key_len = EVP_CIPHER_CTX_key_length(ctx);
-       struct padlock_aes_key *aes_key = (struct padlock_aes_key *) 
(ctx->cipher_data);
-
-       /* Place 'data' at the first 16-Bytes aligned address in 'bigbuf'. */
-       if (((long)bigbuf) & 0x0F)
-               cword = (void*)(bigbuf + 16 - ((long)bigbuf & 0x0F));
-       else
-               cword = (void*)bigbuf;
-
-       /* Prepare Control word. */
-       memset (cword, 0, sizeof(union cword));
-       cword->b.encdec = (ctx->encrypt == 0);
-       cword->b.rounds = 10 + (key_len - 16) / 4;
-       cword->b.ksize = (key_len - 16) / 8;
-       cword->b.keygen = aes_key->extended;
-
-       key_arg = aes_key->aes_key;
-       if ((long)key_arg & 0x0f) {
-               key = padlock_aligned_malloc(AES_KEY_BYTES, 16, &index_key);
-               memcpy (key, key_arg, AES_KEY_BYTES);
-       } else
-               key = key_arg;
-
-       iv = ctx->iv;
-
-       padlock_aligner(out, in, iv, key, cword,
-                       in_len, AES_BLOCK_SIZE, mode);
-
-       if (index_key)
-               free(index_key);
+       memcpy (ctx->iv, iv, blocksize);
 
        return 1;
 }

Reply via email to