Hi all,

the attached patch adds SHA1 support for VIA PadLock engine.

There are several design decisions that I may need to explain:

The xsha1 instruction always finalizes the MD computation, i.e. it is
not possible to call the hardware in sha1_update() with the provided
input buffer. Instead it's necessary to accumulate all data from
update()s in some buffer and hash it only in final().

The immediate problem here is the buffer size. For obvious reasons we
can't let it grow indefinitely. Instead there is a set maximum size (8k
in my patch) after which the engine falls back to software SHA1, pushes
all accumulated data to its update() method and never touches the hardware.

The context structure looks like this:

struct padlock_digest_data {
        void            *buf_start, *buf_alloc;
        ssize_t         used;
        unsigned long   order:8, bypass:1;
        SHA_CTX         fallback_ctx;
};

In padlock_init() I allocate a buffer of a given size (8k as well) whose
first 16B-aligned address goes to buf_start. Having the input data
aligned allows PadLock to crunch it faster.

I did some experiments with having a statically allocated buffer in this
structure (to avoid malloc() for small datasets), but it actually made
things slower. malloc() appears to be fast enough.

And yes, some numbers :-) These are from VIA Esther 1.2GHz:

type          16 bytes  64 bytes   256 bytes  1024 bytes   8192 bytes
sha1-sw       4108.58k  12323.46k  28142.76k   41445.20k    48078.85k
sha1-padlock  3321.16k  12656.98k  44152.58k  116508.25k   224807.59k

Please comment or commit.

Thanks!

Michal Ludvig
-- 
* Personal homepage: http://www.logix.cz/michal



Index: openssl-0.9.8-O2/crypto/engine/eng_padlock.c
===================================================================
--- openssl-0.9.8-O2.orig/crypto/engine/eng_padlock.c
+++ openssl-0.9.8-O2/crypto/engine/eng_padlock.c
@@ -74,11 +74,22 @@
 #ifndef OPENSSL_NO_AES
 #include <openssl/aes.h>
 #endif
+#ifndef OPENSSL_NO_SHA
+#include <openssl/sha.h>
+#endif
 #include <openssl/rand.h>
 
 #ifndef OPENSSL_NO_HW
 #ifndef OPENSSL_NO_HW_PADLOCK
 
+/* PadLock RNG is disabled by default */
+#define        PADLOCK_NO_RNG  1
+
+/* No ASM routines for SHA in MSC yet */
+#ifdef _MSC_VER
+#define OPENSSL_NO_SHA
+#endif
+
 /* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
 #if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
 #  ifndef OPENSSL_NO_DYNAMIC_ENGINE
@@ -134,20 +145,41 @@ static int padlock_available(void);
 static int padlock_init(ENGINE *e);
 
 /* RNG Stuff */
+#ifndef PADLOCK_NO_RNG
 static RAND_METHOD padlock_rand;
+#endif
 
 /* Cipher Stuff */
 #ifndef OPENSSL_NO_AES
 static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int 
**nids, int nid);
 #endif
 
+/* Digest Stuff */
+#ifndef OPENSSL_NO_SHA
+static int padlock_digests(ENGINE *e, const EVP_MD **digest, const int **nids, 
int nid);
+#endif
+
 /* Engine names */
 static const char *padlock_id = "padlock";
 static char padlock_name[100];
 
 /* Available features */
-static int padlock_use_ace = 0;        /* Advanced Cryptography Engine */
-static int padlock_use_rng = 0;        /* Random Number Generator */
+/* Feature bits collected by padlock_available(). Several of them may be
+ * OR-ed together in the padlock_flags variable below. */
+enum padlock_flags {
+       PADLOCK_RNG  = 0x01,    /* Random Number Generator */
+       PADLOCK_ACE  = 0x02,    /* Advanced Cryptography Engine */
+       PADLOCK_ACE2 = 0x04,    /* second generation ACE */
+       PADLOCK_PHE  = 0x08,    /* hash engine, gates the SHA digests */
+       PADLOCK_PMM  = 0x10     /* Montgomery multiplier */
+};
+/* File-local, like the padlock_use_* variables it replaces, so that
+ * the name does not leak out of the engine's object file. */
+static enum padlock_flags padlock_flags;
+
+/* Each macro is non-zero when the corresponding unit was detected */
+#define PADLOCK_HAVE_RNG  (padlock_flags & PADLOCK_RNG)
+#define PADLOCK_HAVE_ACE  (padlock_flags & (PADLOCK_ACE|PADLOCK_ACE2))
+#define PADLOCK_HAVE_ACE1 (padlock_flags & PADLOCK_ACE)
+#define PADLOCK_HAVE_ACE2 (padlock_flags & PADLOCK_ACE2)
+#define PADLOCK_HAVE_PHE  (padlock_flags & PADLOCK_PHE)
+#define PADLOCK_HAVE_PMM  (padlock_flags & PADLOCK_PMM)
+
 #ifndef OPENSSL_NO_AES
 static int padlock_aes_align_required = 1;
 #endif
@@ -161,25 +193,30 @@ padlock_bind_helper(ENGINE *e)
        /* Check available features */
        padlock_available();
 
-#if 1  /* disable RNG for now, see commentary in vicinity of RNG code */
-       padlock_use_rng=0;
-#endif
-
        /* Generate a nice engine name with available features */
        BIO_snprintf(padlock_name, sizeof(padlock_name),
-               "VIA PadLock (%s, %s)", 
-                padlock_use_rng ? "RNG" : "no-RNG",
-                padlock_use_ace ? "ACE" : "no-ACE");
+               "VIA PadLock: %s%s%s%s%s", 
+                padlock_flags ? "" : "not supported",
+                PADLOCK_HAVE_RNG ? "RNG " : "",
+                PADLOCK_HAVE_ACE ? (PADLOCK_HAVE_ACE2 ? "ACE2 " : "ACE ") : "",
+                PADLOCK_HAVE_PHE ? "PHE " : "",
+                PADLOCK_HAVE_PMM ? "PMM " : "");
 
        /* Register everything or return with an error */ 
        if (!ENGINE_set_id(e, padlock_id) ||
            !ENGINE_set_name(e, padlock_name) ||
 
-           !ENGINE_set_init_function(e, padlock_init) ||
+           !ENGINE_set_init_function(e, padlock_init)
 #ifndef OPENSSL_NO_AES
-           (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
+           || (PADLOCK_HAVE_ACE && !ENGINE_set_ciphers (e, padlock_ciphers))
+#endif
+#ifndef OPENSSL_NO_SHA
+           || (PADLOCK_HAVE_PHE && !ENGINE_set_digests (e, padlock_digests))
+#endif
+#ifndef PADLOCK_NO_RNG
+           || (PADLOCK_HAVE_RNG && !ENGINE_set_RAND (e, &padlock_rand))
 #endif
-           (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
+           ) {
                return 0;
        }
 
@@ -209,7 +246,7 @@ ENGINE_padlock(void)
 static int
 padlock_init(ENGINE *e)
 {
-       return (padlock_use_rng || padlock_use_ace);
+       return (padlock_flags);
 }
 
 /* This stuff is needed if this ENGINE is being compiled into a self-contained
@@ -236,6 +273,17 @@ IMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_
 
 /* ===== Here comes the "real" engine ===== */
 
+#ifdef __GNUC__
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)     __builtin_expect(!!(x), 0)
+#else
+#define likely(x)       (x)
+#define unlikely(x)     (x)
+#endif
+
+/* The PadLock code needs a word of exactly 32 bits: bswapl works on
+ * 32-bit registers and the SHA1 state is five 32-bit words.
+ * 'unsigned int' is 32 bits on both x86 and x86-64, while
+ * 'unsigned long' grows to 64 bits on LP64 targets.
+ * TODO: how to test if we need to typedef uint32_t at all, i.e. skip
+ *       this when a system header already provides it? */
+typedef unsigned int uint32_t;
+
 #ifndef OPENSSL_NO_AES
 /* Some AES-related constants */
 #define AES_BLOCK_SIZE         16
@@ -359,10 +407,22 @@ padlock_available(void)
                : "+a"(eax), "=d"(edx) : : "ecx");
 
        /* Fill up some flags */
-       padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
-       padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));
+       padlock_flags |= ((edx & (0x3<<3)) ? PADLOCK_RNG : 0);
+       padlock_flags |= ((edx & (0x3<<7)) ? PADLOCK_ACE : 0);
+       padlock_flags |= ((edx & (0x3<<9)) ? PADLOCK_ACE2 : 0);
+       padlock_flags |= ((edx & (0x3<<11)) ? PADLOCK_PHE : 0);
+       padlock_flags |= ((edx & (0x3<<13)) ? PADLOCK_PMM : 0);
 
-       return padlock_use_ace + padlock_use_rng;
+       return padlock_flags;
+}
+
+static inline void
+padlock_htonl_block(uint32_t *data, size_t count)
+{
+       while (count--) {
+               asm volatile ("bswapl %0" : "+r"(*data));
+               data++;
+       }
 }
 
 #ifndef OPENSSL_NO_AES
@@ -371,12 +431,9 @@ static inline void
 padlock_bswapl(AES_KEY *ks)
 {
        size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
-       unsigned int *key = ks->rd_key;
+       uint32_t *key = (uint32_t*) ks->rd_key;
 
-       while (i--) {
-               asm volatile ("bswapl %0" : "+r"(*key));
-               key++;
-       }
+       padlock_htonl_block(key, i);
 }
 #endif
 
@@ -1047,6 +1104,250 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, 
 
 #endif /* OPENSSL_NO_AES */
 
+#ifndef OPENSSL_NO_SHA
+
+#define PADLOCK_SHA_INIT_ORD   13      /* = 8192 */
+#define PADLOCK_SHA_MAX_ORD    13      /* = 8192 */
+
+// #define PADLOCK_SHA_STAT 1
+
+/* Don't forget to initialize all relevant 
+ * fields in padlock_sha_init() or face the
+ * consequences!!! 
+ * BTW We don't use bzero() on this structure
+ *     because zeroing fallback_ctx is
+ *     a waste of time. */
+struct padlock_digest_data {   /* per-context digest state, reached via DIGEST_DATA(ctx) */
+       SHA_CTX         fallback_ctx;   /* software SHA1 state; only initialized once bypass is entered */
+       void            *buf_start, *buf_alloc; /* buf_alloc: pointer returned by malloc()/realloc();
+                                                * buf_start: NEAREST_ALIGNED() address inside it where input is accumulated */
+       ssize_t         used;   /* number of bytes accumulated at buf_start so far */
+       unsigned long   order:8, bypass:1;      /* order: log2 of the buffer capacity;
+                                                * bypass: 1 once hashing was handed over to fallback_ctx */
+#ifdef PADLOCK_SHA_STAT
+       size_t          stat_count, stat_total; /* update() calls and total bytes seen, statistics only */
+#endif
+};
+
+#ifdef PADLOCK_SHA_STAT
+size_t all_count, all_total;
+#endif
+
+#define DIGEST_DATA(ctx) ((struct padlock_digest_data *)(ctx->md_data))
+#define DDATA_FREE(ddata) ((size_t)(1L << ddata->order) - ddata->used)
+
+/* Switch a digest context over to the software SHA1 implementation.
+ *
+ * Initializes fallback_ctx, replays the bytes accumulated so far into
+ * it, releases the accumulation buffer and marks the context as
+ * bypassed. Calling it on an already bypassed context is a no-op. */
+static void
+padlock_sha_bypass(struct padlock_digest_data *ddata)
+{
+       if (ddata->bypass)
+               return;
+
+       SHA1_Init(&ddata->fallback_ctx);
+
+       /* Replay whatever has been buffered for the hardware */
+       if (ddata->buf_start && ddata->used > 0)
+               SHA1_Update(&ddata->fallback_ctx, ddata->buf_start,
+                           ddata->used);
+
+       /* The buffer is never used again in bypass mode, so release it
+        * even when it was still empty (e.g. the very first update()
+        * was already too big). free(NULL) is harmless. */
+       free(ddata->buf_alloc);
+       ddata->buf_alloc = NULL;
+       ddata->buf_start = NULL;
+       ddata->used = 0;
+       ddata->bypass = 1;
+}
+
+/* Hash 'count' bytes starting at 'in' with one "rep xsha1" and store
+ * the 20-byte digest to 'out'.
+ *
+ * in    - input data
+ * out   - receives 5 * 32 bits of digest, may have any alignment
+ * count - number of input bytes */
+static inline void
+padlock_do_sha1(char *in, char *out, int count)
+{
+       /* We can't let the hardware store directly to *out as it
+        * doesn't have to be aligned. But who cares,
+        * it's only a few bytes... */
+       char buf[128+16];
+       uint32_t *state = (uint32_t *)NEAREST_ALIGNED(buf);
+       /* Separate copy for the asm operand, which is declared
+        * read-write; 'state' keeps pointing to the scratch area. */
+       char *dst = (char *)state;
+
+       /* SHA1 initial hash value */
+       state[0] = 0x67452301;
+       state[1] = 0xEFCDAB89;
+       state[2] = 0x98BADCFE;
+       state[3] = 0x10325476;
+       state[4] = 0xC3D2E1F0;
+
+       /* The instruction reads the input and rewrites the state
+        * through the pointers only, which the compiler cannot see.
+        * The "memory" clobber keeps the stores above and the loads
+        * below from being cached or moved across the asm. */
+       asm volatile (".byte 0xf3,0x0f,0xa6,0xc8"       /* rep xsha1 */
+                     : "+S"(in), "+D"(dst)
+                     : "c"(count), "a"(0)
+                     : "memory");
+
+       /* Byte-swap while still in the aligned scratch area, so that
+        * 'out' is only ever touched by memcpy(). */
+       padlock_htonl_block(state, 5);
+       memcpy(out, state, 5 * sizeof(uint32_t));
+}
+
+/* EVP init() callback: reset the per-context state and allocate the
+ * accumulation buffer of (1 << PADLOCK_SHA_INIT_ORD) bytes plus 16
+ * bytes of slack for alignment.
+ *
+ * If the buffer cannot be allocated the context is switched to the
+ * software SHA1 right away instead of failing.
+ * Always returns 1. */
+static int
+padlock_sha_init(EVP_MD_CTX *ctx)
+{
+       struct padlock_digest_data *ddata = DIGEST_DATA(ctx);
+
+       ddata->used = 0;
+       ddata->bypass = 0;
+       ddata->order = PADLOCK_SHA_INIT_ORD;
+
+       ddata->buf_alloc = malloc((1L << ddata->order) + 16);
+       if (!ddata->buf_alloc) {
+               /* No buffer -> nothing to feed the hardware from.
+                * Same fallback plan as for a failed realloc(). */
+               ddata->buf_start = NULL;
+               padlock_sha_bypass(ddata);
+               return 1;
+       }
+       ddata->buf_start = NEAREST_ALIGNED(ddata->buf_alloc);
+
+       return 1;
+}
+
+/* EVP update() callback: append 'length' bytes at 'data' to the
+ * accumulation buffer, growing it when needed.
+ *
+ * Once the total would exceed (1 << PADLOCK_SHA_MAX_ORD) bytes, or
+ * when no buffer is available, the context is switched to software
+ * SHA1 and stays there. Always returns 1. */
+static int
+padlock_sha_update(EVP_MD_CTX *ctx, const void *data, size_t length)
+{
+       struct padlock_digest_data *ddata = DIGEST_DATA(ctx);
+       const size_t max_size = (size_t)1 << PADLOCK_SHA_MAX_ORD;
+
+#ifdef PADLOCK_SHA_STAT
+       ddata->stat_count++;
+       ddata->stat_total += length;
+       all_count++;
+       all_total += length;
+#endif
+       if (unlikely(ddata->bypass)) {
+               SHA1_Update(&ddata->fallback_ctx, data, length);
+               return 1;
+       }
+
+       /* No buffer at all, or too much data to be stored -> bypass to
+        * SW SHA. The limit is tested as "length > max - used" so that
+        * a huge length cannot wrap the sum around. */
+       if (unlikely(!ddata->buf_start ||
+                    length > max_size - (size_t)ddata->used)) {
+               padlock_sha_bypass(ddata);
+               SHA1_Update(&ddata->fallback_ctx, data, length);
+               return 1;
+       }
+
+       if (unlikely(DDATA_FREE(ddata) < length)) {
+               /* Resize the allocated buffer */
+               size_t needed = (size_t)ddata->used + length;
+               size_t old_off = (size_t)((char *)ddata->buf_start -
+                                         (char *)ddata->buf_alloc);
+               char *new_buf, *new_start;
+
+               do {
+                       ddata->order++;
+               } while (((size_t)1 << ddata->order) < needed);
+
+               new_buf = realloc(ddata->buf_alloc,
+                                 ((size_t)1 << ddata->order) + 16);
+               if (!new_buf) {
+                       /* fallback plan again; the old buffer is still
+                        * valid and gets replayed by the bypass */
+                       padlock_sha_bypass(ddata);
+                       SHA1_Update(&ddata->fallback_ctx, data, length);
+                       return 1;
+               }
+
+               /* realloc() preserved the bytes at their old offset
+                * from the start of the allocation, but the aligned
+                * start may sit at a different offset in the new
+                * block. Move the data where buf_start will point. */
+               new_start = (char *)NEAREST_ALIGNED(new_buf);
+               if (new_start != new_buf + old_off)
+                       memmove(new_start, new_buf + old_off,
+                               (size_t)ddata->used);
+
+               ddata->buf_alloc = new_buf;
+               ddata->buf_start = new_start;
+       }
+
+       memcpy((char *)ddata->buf_start + ddata->used, data, length);
+       ddata->used += length;
+
+       return 1;
+}
+
+/* EVP final() callback: write the digest of everything fed through
+ * update() to 'md'. In bypass mode the software context is finalized,
+ * otherwise the accumulated buffer is hashed by PadLock and released.
+ * Always returns 1. */
+static int
+padlock_sha_final(EVP_MD_CTX *ctx, unsigned char *md)
+{
+       struct padlock_digest_data *dd = DIGEST_DATA(ctx);
+
+#ifdef PADLOCK_SHA_STAT
+       fprintf(stderr, "PadLock CTX: cnt=%zu, tot=%zu, avg=%zu\n",
+               dd->stat_count, dd->stat_total,
+               dd->stat_count ? (dd->stat_total/dd->stat_count) : 0);
+       fprintf(stderr, "PadLock ALL: cnt=%zu, tot=%zu, avg=%zu\n",
+               all_count, all_total, all_count ? (all_total/all_count) : 0);
+#endif
+
+       if (!dd->bypass) {
+               /* Hardware path: hash the whole buffer in one go,
+                * then drop the buffer and forget about it. */
+               padlock_do_sha1(dd->buf_start, md, dd->used);
+               free(dd->buf_alloc);
+               dd->buf_alloc = 0;
+               dd->buf_start = 0;
+               dd->used = 0;
+       } else {
+               /* Software path: the data already went to fallback_ctx */
+               SHA1_Final(md, &dd->fallback_ctx);
+       }
+
+       return 1;
+}
+
+/* EVP copy() callback: make 'to' an independent duplicate of 'from'.
+ *
+ * The state structure is copied as a whole and, when 'from' owns an
+ * accumulation buffer, 'to' gets a private buffer of the same capacity
+ * holding the same bytes.
+ * Returns 1 on success, 0 when the buffer could not be allocated. */
+static int
+padlock_sha_copy(EVP_MD_CTX *to,const EVP_MD_CTX *from)
+{
+       struct padlock_digest_data *ddata_from = DIGEST_DATA(from);
+       struct padlock_digest_data *ddata_to = DIGEST_DATA(to);
+
+       memcpy(ddata_to, ddata_from, sizeof(struct padlock_digest_data));
+       if (!ddata_from->buf_alloc)
+               return 1;
+
+       /* Same size as in init()/update(): capacity plus 16 bytes of
+        * slack consumed by NEAREST_ALIGNED(). */
+       ddata_to->buf_alloc = malloc((1L << ddata_to->order) + 16);
+       if (!ddata_to->buf_alloc) {
+               /* Don't leave 'to' pointing into the buffer owned by
+                * 'from', it would be freed twice. */
+               ddata_to->buf_start = NULL;
+               ddata_to->used = 0;
+               return 0;
+       }
+       ddata_to->buf_start = NEAREST_ALIGNED(ddata_to->buf_alloc);
+       if (ddata_from->buf_start && ddata_from->used > 0)
+               memcpy(ddata_to->buf_start, ddata_from->buf_start,
+                      ddata_from->used);
+       return 1;
+}
+
+/* EVP cleanup() callback: release the accumulation buffer, if any,
+ * and wipe the whole per-context state. Always returns 1. */
+static int
+padlock_sha_cleanup(EVP_MD_CTX *ctx)
+{
+       struct padlock_digest_data *dd = DIGEST_DATA(ctx);
+       void *buffer = dd->buf_alloc;
+
+       if (buffer != 0)
+               free(buffer);
+
+       memset(dd, 0, sizeof(*dd));
+
+       return 1;
+}
+
+static const EVP_MD padlock_sha1_md = {        /* handed out by padlock_digests() for NID_sha1;
+                                        * NOTE(review): field roles below assume the EVP_MD layout of the targeted OpenSSL - confirm against evp.h */
+       NID_sha1,                       /* digest NID */
+       NID_sha1WithRSAEncryption,      /* associated signature NID */
+       SHA_DIGEST_LENGTH,              /* digest size */
+       0,                              /* presumably the flags field, none set */
+       padlock_sha_init,               /* the five callbacks defined in this file */
+       padlock_sha_update,
+       padlock_sha_final,
+       padlock_sha_copy,
+       padlock_sha_cleanup,
+       EVP_PKEY_RSA_method,            /* macro from OpenSSL, not defined in this file */
+       SHA_CBLOCK,                     /* presumably the input block size */
+       sizeof(struct padlock_digest_data),     /* size of the state that DIGEST_DATA(ctx) casts ctx->md_data to */
+};
+
+/* NIDs of the digests this engine can provide */
+static int padlock_digest_nids[] = {
+       NID_sha1,
+       /* NID_sha256 is not enabled */
+};
+
+static int padlock_digest_nids_num =
+       sizeof(padlock_digest_nids)/sizeof(padlock_digest_nids[0]);
+
+/* Digest selector registered with ENGINE_set_digests().
+ *
+ * With digest == NULL it stores the list of supported NIDs to *nids
+ * and returns their count. Otherwise it stores the EVP_MD for 'nid' to
+ * *digest and returns 1, or stores NULL and returns 0 when the NID is
+ * not supported. */
+static int
+padlock_digests (ENGINE *e, const EVP_MD **digest, const int **nids, int nid)
+{
+       /* Query for the list of supported nids */
+       if (digest == NULL) {
+               *nids = padlock_digest_nids;
+               return padlock_digest_nids_num;
+       }
+
+       /* Query for one particular digest */
+       if (nid == NID_sha1) {
+               *digest = &padlock_sha1_md;
+               return 1;
+       }
+
+       /* Sorry, we don't support this NID */
+       *digest = NULL;
+       return 0;
+}
+
+#endif /* OPENSSL_NO_SHA */
+
+#ifndef PADLOCK_NO_RNG
 /* ===== Random Number Generator ===== */
 /*
  * This code is not engaged. The reason is that it does not comply
@@ -1102,6 +1403,7 @@ static RAND_METHOD padlock_rand = {
        padlock_rand_bytes,     /* pseudorand */
        padlock_rand_status,    /* rand status */
 };
+#endif /* PADLOCK_NO_RNG */
 
 #endif /* COMPILE_HW_PADLOCK */
 

Reply via email to