/*----------------------------------------------------------------------------
// Licensed materials - Property of IBM                                      
//
// (C) Copyright IBM Corp.  2007,2009
// This code was donated to the OpenSSL project under the terms of the 
// OpenSSL license.
//
// Table driven GCM acceleration add by Aaron Cristensen November 2007.
// Fix bug reported by Paul Suhler
// Fix bug reported by Vince Hendricks
// Fix problem which prevents key reuse
//---------------------------------------------------------------------------*/


#ifndef AES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#include <string.h>

#include "openssl/crypto.h"
#include "openssl/evp.h"
#include "openssl/aes.h"
#include "openssl/rand.h"

#if !defined(AES_BLOCK_SIZE)
#define AES_BLOCK_SIZE 16
#endif

#include "aes_gcm.h"


/*
\debug Define: Uncomment to enable debugging of the AES-GCM code
#define AES_GCM_DEBUG
*/

#if defined(AES_GCM_DEBUG)
/* Sequence numbers used to label lines of debug output.
   printX, printN and printEY each post-increment their own counter when
   they print; printY only reads Yc.  AES_GCM_Init resets them
   (Xc = 1, Yc = 0, Nc = 1) at the start of an operation. */
static int Xc = 0;
static int Yc = 0;
static int Nc = 0;

/*! @brief Print a labelled buffer as a single line of lowercase hex
    @param tag label, printed left-justified in a 20 column field
    @param buf the bytes to dump
    @param blen how many bytes of buf to print (nothing is dumped if <= 0)
    \debug Code: printbin: controlled by AES_GCM_DEBUG
*/

static void printbin(char *tag,unsigned char *buf, int blen)
{
  int remaining;
  unsigned char *cursor = buf;

  printf("%-20s: ",tag);
  for(remaining = blen; remaining > 0; remaining--) {
    printf("%02x",(unsigned)*cursor);
    cursor++;
  }
  printf("\n");
}
/* Debug trace: hex dump of buf labelled "Y_<Yc>".
   Yc is only read here; printEY is the routine that advances it. */
static void printY(unsigned char *buf, int blen)
{
  int remaining;
  unsigned char *cursor = buf;

  printf("Y_%1d%17s: ",Yc,"");
  for(remaining = blen; remaining > 0; remaining--) {
    printf("%02x",(unsigned)*cursor);
    cursor++;
  }
  printf("\n");
}
/* Debug trace: hex dump of buf labelled "X_<Xc>"; Xc is incremented
   after it has been used for the label. */
static void printX(unsigned char *buf, int blen)
{
  int remaining;
  unsigned char *cursor = buf;

  printf("X_%1d%17s: ",Xc++,"");
  for(remaining = blen; remaining > 0; remaining--) {
    printf("%02x",(unsigned)*cursor);
    cursor++;
  }
  printf("\n");
}
/* Debug trace: hex dump of buf labelled "N_<Nc>"; Nc is incremented
   after it has been used for the label. */
static void printN(unsigned char *buf, int blen)
{
  int remaining;
  unsigned char *cursor = buf;

  printf("N_%1d%17s: ",Nc++,"");
  for(remaining = blen; remaining > 0; remaining--) {
    printf("%02x",(unsigned)*cursor);
    cursor++;
  }
  printf("\n");
}
/* Debug trace: hex dump of buf labelled "E(K,Y_<Yc>)"; Yc is incremented
   after it has been used for the label. */
static void printEY(unsigned char *buf, int blen)
{
  int remaining;
  unsigned char *cursor = buf;

  printf("E(K,Y_%1d)%12s: ",Yc++,"");
  for(remaining = blen; remaining > 0; remaining--) {
    printf("%02x",(unsigned)*cursor);
    cursor++;
  }
  printf("\n");
}



#else
/* AES_GCM_DEBUG is off: the trace helpers expand to nothing, so the
   call sites scattered through the code disappear at compile time. */
#define printbin(x,y,z) 
#define printX(x,y)
#define printY(x,y)
#define printN(x,y)
#define printEY(x,y)
#endif

/* One AES block of zero bytes: the object has static storage duration and
   no initializer, so it is zero-initialized.
   NOTE(review): not referenced in this part of the file; presumably the
   input used to derive the hash key later on - confirm. */
static const unsigned char zero[AES_BLOCK_SIZE];

/** @brief Serialize a length field as an 8 byte big endian byte stream
    @param l the value to encode.  Only its low 64 bits are written; if
    unsigned long is narrower than 8 bytes the leading output bytes are 0.
    @param b (8 byte) output buffer, b[0] receives the most significant byte
*/
static void ULongToChar(unsigned long l, unsigned char *b)
{
    unsigned char *out = b + 8;

    /* Fill from the last byte backwards, peeling off 8 bits at a time */
    while(out != b) {
        out--;
        *out = (unsigned char)(l & 0xff);
        l >>= 8;
    }
}


/*! @brief Add one to a big endian counter, in place
  @param counter pointer to the first (most significant) byte of the counter
  @param n number of bytes in the counter; nothing happens if n <= 0
  @note: the IV/counter in CTR mode is big-endian.  The rest of the AES code
  is endian-neutral.
  The carry ripples from the last byte towards the first and is discarded
  if it runs off the front, i.e. the counter wraps to zero.
*/
static void AES_gcm_inc(unsigned char *counter, int n) {
  int pos = n;

  while(pos > 0) {
    pos--;
    counter[pos]++;
    /* A non-zero result means this byte did not wrap: no carry, done */
    if(counter[pos] != 0) {
      return;
    }
  }
}





/** @brief Byte-wise XOR of two equally sized buffers.
    The destination may be the same buffer as either source, since each
    output byte depends only on the input bytes at the same offset.
    @param dest the destination buffer
    @param s1 first source buffer
    @param s2 second source buffer
    @param blen number of bytes to process
*/

static void xor(unsigned char *dest, unsigned char *s1, unsigned char *s2, unsigned blen)
{
    unsigned left = blen;

    while(left != 0) {
	*dest = *s1 ^ *s2;
	dest++;
	s1++;
	s2++;
	left--;
    }
}

/** @brief Shift a buffer right by one bit, treating it as a single big
    endian bit string: the low bit of each byte becomes the high bit of the
    following byte.  A zero is shifted into the top of buf[0] and the low
    bit of the last byte is dropped.
    @param buf  buffer to shift in place
    @param blen buffer length in bytes (0 is a no-op)
*/
static void shr(unsigned char *buf,unsigned blen)
{
    unsigned carry_in = 0;
    unsigned idx;

    for(idx = 0; idx < blen; idx++) {
	/* Remember the bit about to fall out of this byte */
	unsigned carry_out = (buf[idx] & 1) ? 0x80 : 0;

	buf[idx] = (unsigned char)((buf[idx] >> 1) | carry_in);
	carry_in = carry_out;
    }
}
#if 0
/** @brief Shift a buffer left by one bit, treating it as a single big
    endian bit string: the high bit of each byte becomes the low bit of the
    preceding byte.  A zero is shifted into the bottom of the last byte and
    the high bit of buf[0] is dropped.
    @param buf  buffer to shift in place
    @param blen buffer length in bytes (0 is a no-op)
*/
static void shl(unsigned char *buf,unsigned blen)
{
    unsigned carry_in = 0;
    int pos;

    if(blen == 0) {
      return;
    }
    /* Walk from the least significant (last) byte towards the first */
    for(pos = blen-1; pos >= 0; pos--) {
      unsigned carry_out = (buf[pos] & 0x80) ? 0x01 : 0;

      buf[pos] = (buf[pos] << 1) | carry_in;
      carry_in = carry_out;
    }
}
#endif

/* Reduction constants for moving a 16 byte GHASH value right by one whole
   byte.  The index is the byte that falls off position 15.  After the
   remaining bytes have been moved down one place, element [0] becomes the
   new byte 0 and element [1] is XORed into byte 1 (see
   GCM_TAB_gen_8bit_table_n and GCM_mult_level2).
   Entry 0x80 is {0xE1,0x00}: the same 0xE1 constant the bit-at-a-time code
   folds into byte 0 when a single set bit is shifted out. */
static unsigned char rem_table_8bit[256][2] = {
    {0x00,0x00}, {0x01,0xC2}, {0x03,0x84}, {0x02,0x46}, {0x07,0x08}, {0x06,0xCA},
    {0x04,0x8C}, {0x05,0x4E}, {0x0E,0x10}, {0x0F,0xD2}, {0x0D,0x94}, {0x0C,0x56},
    {0x09,0x18}, {0x08,0xDA}, {0x0A,0x9C}, {0x0B,0x5E}, {0x1C,0x20}, {0x1D,0xE2},
    {0x1F,0xA4}, {0x1E,0x66}, {0x1B,0x28}, {0x1A,0xEA}, {0x18,0xAC}, {0x19,0x6E},
    {0x12,0x30}, {0x13,0xF2}, {0x11,0xB4}, {0x10,0x76}, {0x15,0x38}, {0x14,0xFA},
    {0x16,0xBC}, {0x17,0x7E}, {0x38,0x40}, {0x39,0x82}, {0x3B,0xC4}, {0x3A,0x06},
    {0x3F,0x48}, {0x3E,0x8A}, {0x3C,0xCC}, {0x3D,0x0E}, {0x36,0x50}, {0x37,0x92},
    {0x35,0xD4}, {0x34,0x16}, {0x31,0x58}, {0x30,0x9A}, {0x32,0xDC}, {0x33,0x1E},
    {0x24,0x60}, {0x25,0xA2}, {0x27,0xE4}, {0x26,0x26}, {0x23,0x68}, {0x22,0xAA},
    {0x20,0xEC}, {0x21,0x2E}, {0x2A,0x70}, {0x2B,0xB2}, {0x29,0xF4}, {0x28,0x36},
    {0x2D,0x78}, {0x2C,0xBA}, {0x2E,0xFC}, {0x2F,0x3E}, {0x70,0x80}, {0x71,0x42},
    {0x73,0x04}, {0x72,0xC6}, {0x77,0x88}, {0x76,0x4A}, {0x74,0x0C}, {0x75,0xCE},
    {0x7E,0x90}, {0x7F,0x52}, {0x7D,0x14}, {0x7C,0xD6}, {0x79,0x98}, {0x78,0x5A},
    {0x7A,0x1C}, {0x7B,0xDE}, {0x6C,0xA0}, {0x6D,0x62}, {0x6F,0x24}, {0x6E,0xE6},
    {0x6B,0xA8}, {0x6A,0x6A}, {0x68,0x2C}, {0x69,0xEE}, {0x62,0xB0}, {0x63,0x72},
    {0x61,0x34}, {0x60,0xF6}, {0x65,0xB8}, {0x64,0x7A}, {0x66,0x3C}, {0x67,0xFE},
    {0x48,0xC0}, {0x49,0x02}, {0x4B,0x44}, {0x4A,0x86}, {0x4F,0xC8}, {0x4E,0x0A},
    {0x4C,0x4C}, {0x4D,0x8E}, {0x46,0xD0}, {0x47,0x12}, {0x45,0x54}, {0x44,0x96},
    {0x41,0xD8}, {0x40,0x1A}, {0x42,0x5C}, {0x43,0x9E}, {0x54,0xE0}, {0x55,0x22},
    {0x57,0x64}, {0x56,0xA6}, {0x53,0xE8}, {0x52,0x2A}, {0x50,0x6C}, {0x51,0xAE},
    {0x5A,0xF0}, {0x5B,0x32}, {0x59,0x74}, {0x58,0xB6}, {0x5D,0xF8}, {0x5C,0x3A},
    {0x5E,0x7C}, {0x5F,0xBE}, {0xE1,0x00}, {0xE0,0xC2}, {0xE2,0x84}, {0xE3,0x46},
    {0xE6,0x08}, {0xE7,0xCA}, {0xE5,0x8C}, {0xE4,0x4E}, {0xEF,0x10}, {0xEE,0xD2},
    {0xEC,0x94}, {0xED,0x56}, {0xE8,0x18}, {0xE9,0xDA}, {0xEB,0x9C}, {0xEA,0x5E},
    {0xFD,0x20}, {0xFC,0xE2}, {0xFE,0xA4}, {0xFF,0x66}, {0xFA,0x28}, {0xFB,0xEA},
    {0xF9,0xAC}, {0xF8,0x6E}, {0xF3,0x30}, {0xF2,0xF2}, {0xF0,0xB4}, {0xF1,0x76},
    {0xF4,0x38}, {0xF5,0xFA}, {0xF7,0xBC}, {0xF6,0x7E}, {0xD9,0x40}, {0xD8,0x82},
    {0xDA,0xC4}, {0xDB,0x06}, {0xDE,0x48}, {0xDF,0x8A}, {0xDD,0xCC}, {0xDC,0x0E},
    {0xD7,0x50}, {0xD6,0x92}, {0xD4,0xD4}, {0xD5,0x16}, {0xD0,0x58}, {0xD1,0x9A},
    {0xD3,0xDC}, {0xD2,0x1E}, {0xC5,0x60}, {0xC4,0xA2}, {0xC6,0xE4}, {0xC7,0x26},
    {0xC2,0x68}, {0xC3,0xAA}, {0xC1,0xEC}, {0xC0,0x2E}, {0xCB,0x70}, {0xCA,0xB2},
    {0xC8,0xF4}, {0xC9,0x36}, {0xCC,0x78}, {0xCD,0xBA}, {0xCF,0xFC}, {0xCE,0x3E},
    {0x91,0x80}, {0x90,0x42}, {0x92,0x04}, {0x93,0xC6}, {0x96,0x88}, {0x97,0x4A},
    {0x95,0x0C}, {0x94,0xCE}, {0x9F,0x90}, {0x9E,0x52}, {0x9C,0x14}, {0x9D,0xD6},
    {0x98,0x98}, {0x99,0x5A}, {0x9B,0x1C}, {0x9A,0xDE}, {0x8D,0xA0}, {0x8C,0x62},
    {0x8E,0x24}, {0x8F,0xE6}, {0x8A,0xA8}, {0x8B,0x6A}, {0x89,0x2C}, {0x88,0xEE},
    {0x83,0xB0}, {0x82,0x72}, {0x80,0x34}, {0x81,0xF6}, {0x84,0xB8}, {0x85,0x7A},
    {0x87,0x3C}, {0x86,0xFE}, {0xA9,0xC0}, {0xA8,0x02}, {0xAA,0x44}, {0xAB,0x86},
    {0xAE,0xC8}, {0xAF,0x0A}, {0xAD,0x4C}, {0xAC,0x8E}, {0xA7,0xD0}, {0xA6,0x12},
    {0xA4,0x54}, {0xA5,0x96}, {0xA0,0xD8}, {0xA1,0x1A}, {0xA3,0x5C}, {0xA2,0x9E},
    {0xB5,0xE0}, {0xB4,0x22}, {0xB6,0x64}, {0xB7,0xA6}, {0xB2,0xE8}, {0xB3,0x2A},
    {0xB1,0x6C}, {0xB0,0xAE}, {0xBB,0xF0}, {0xBA,0x32}, {0xB8,0x74}, {0xB9,0xB6},
    {0xBC,0xF8}, {0xBD,0x3A}, {0xBF,0x7C}, {0xBE,0xBE}
};

/* Reduction constants for moving a 16 byte GHASH value right by one nibble
   (4 bits).  The index is the low nibble of byte 15, i.e. the four bits
   that fall off the end.  After the shift, element [0] is XORed into
   byte 0 and element [1] into byte 1 (see GCM_TAB_gen_4bit_table_n and
   GCM_mult_level1).  Entry 8 is {0xE1,0x00}, matching the 0xE1 constant
   used by the bit-at-a-time code. */
static unsigned char rem_table_4bit[16][2] = {
    {0x00,0x00}, {0x1C,0x20}, {0x38,0x40}, {0x24,0x60}, {0x70,0x80}, {0x6C,0xA0},
    {0x48,0xC0}, {0x54,0xE0}, {0xE1,0x00}, {0xFD,0x20}, {0xD9,0x40}, {0xC5,0x60},
    {0x91,0x80}, {0x8D,0xA0}, {0xA9,0xC0}, {0xB5,0xE0}
};


/*! @brief Build the n-th byte-indexed multiplication table for hkey.
    Entry t[v] is the product of hkey with the byte value v, after hkey has
    first been moved n byte positions (n * 8 bits) to the right in the field.
    @param n which table to generate (the nth table), i.e. how many
    byte-sized shifts to apply to hkey first
    @param hkey the authentication key (16 bytes)
    @param t receives the 256 x 16 byte table
*/
static void GCM_TAB_gen_8bit_table_n(
				     int n,
				     unsigned char hkey[16],
				     unsigned char t[256][16])
{
  unsigned char *unit = t[128];
  int round;
  int bit;
  int k;
  int dropped;
  int lsb_set;

  /* Index 128 (0x80, only the leading bit set) stands for f(x) = 1, so it
     starts out as hkey itself.  Each round then multiplies it by x^8:
     move every byte down one place and fold the byte that fell off the
     end back in through the reduction table. */
  memcpy(unit, hkey, 16);
  for (round = n; round > 0; round--) {
    dropped = unit[15];
    memmove(unit + 1, unit, 15);
    unit[0] = rem_table_8bit[dropped][0];
    unit[1] ^= rem_table_8bit[dropped][1];
  }

  /* Remaining single-bit indices: each is the previous one times x, which
     is a one bit right shift plus a reduction if a set bit was shifted out. */
  for (bit = 64; bit > 0; bit >>= 1) {
    memcpy(t[bit], t[bit << 1], 16);
    lsb_set = t[bit][15] & 0x01;
    shr(t[bit], 16);
    if (lsb_set) {
      t[bit][0] ^= 0xE1;
    }
  }

  /* Every other index is a sum (XOR) of single-bit entries: combine the
     entry for the top set bit with the already finished lower entries. */
  for (bit = 2; bit < 256; bit <<= 1) {
    for (k = 1; k < bit; k++) {
      xor(t[bit + k], t[bit], t[k], 16);
    }
  }

  /* Multiplying by zero gives zero */
  memset(t[0], 0x00, 16);
}

/*! @brief Build the n-th nibble-indexed multiplication table for hkey.
  Entry t[v] is the product of hkey with the nibble value v, after hkey has
  first been moved n nibble positions (n * 4 bits) to the right in the field.
  @param n which table to generate (the nth table), i.e. how many
  nibble-sized shifts to apply to hkey first
  @param hkey the authentication key (16 bytes)
  @param t receives the 16 x 16 byte table
*/
static void GCM_TAB_gen_4bit_table_n(
				     int n,
				     unsigned char hkey[16],
				     unsigned char t[16][16])
{
  unsigned char *unit = t[8];
  int round;
  int bit;
  int k;
  int dropped;
  int lsb_set;

  /* Same scheme as the 8 bit tables, at nibble scale: index 8 (only the
     leading bit of the nibble set) stands for f(x) = 1 and starts as hkey.
     Each round multiplies it by x^4: shift the whole value right by four
     bits and fold the nibble that fell off back in via the reduction table. */
  memcpy(unit, hkey, 16);
  for (round = n; round > 0; round--) {
    dropped = unit[15] & 0x0F;
    /* Work from the last byte upwards so unit[k - 1] is still unshifted */
    for (k = 15; k > 0; k--) {
      unit[k] = (unsigned char)((unit[k] >> 4) | (unit[k - 1] << 4));
    }
    unit[0] = (unsigned char)((unit[0] >> 4) ^ rem_table_4bit[dropped][0]);
    unit[1] ^= rem_table_4bit[dropped][1];
  }

  /* Single-bit indices 4, 2, 1: previous entry times x */
  for (bit = 4; bit > 0; bit >>= 1) {
    memcpy(t[bit], t[bit << 1], 16);
    lsb_set = t[bit][15] & 0x01;
    shr(t[bit], 16);
    if (lsb_set) {
      t[bit][0] ^= 0xE1;
    }
  }

  /* Composite indices are XOR sums of the single-bit entries */
  for (bit = 2; bit < 16; bit <<= 1) {
    for (k = 1; k < bit; k++) {
      xor(t[bit + k], t[bit], t[k], 16);
    }
  }

  /* Multiplying by zero gives zero */
  memset(t[0], 0x00, 16);
}

/*! @brief initialize the table for level 1 acceleration.
  This acceleration level keeps a single 16 entry table: the hash key
  multiplied by each possible value (0x0-0xF) of the first nibble, i.e. by
  every combination of x^0 .. x^3.  It is the same table as table 0 of the
  8KB (level 3) layout.
  @param hkey the authentication key
  @param t the table that will be initialized (16 x 16 bytes)
  @note Returns nothing; table generation has no failure path.
*/
static void GCM_TAB_init_256b(
			      unsigned char hkey[16],
			      unsigned char t[16][16])
{
  /* n = 0: no pre-shift of the key, just the first nibble's table */
  GCM_TAB_gen_4bit_table_n(0, hkey, t);
}

/*! @brief initialize the table for level 2 acceleration.
  This acceleration level keeps a single 256 entry table: the hash key
  multiplied by each possible value (0x00-0xFF) of the first byte, i.e. by
  every combination of x^0 .. x^7.  It is the same table as table 0 of the
  64KB (level 4) layout.
  @param hkey the authentication key
  @param t the table that will be initialized (256 x 16 bytes)
  @note Returns nothing; table generation has no failure path.
*/
static void GCM_TAB_init_4kb(
			     unsigned char hkey[16],
			     unsigned char t[256][16])
{
  /* n = 0: no pre-shift of the key, just the first byte's table */
  GCM_TAB_gen_8bit_table_n(0, hkey, t);
}

/*! @brief initialize the tables for level 3 acceleration.
  This acceleration level keeps one 16 entry table for each of the 32
  nibbles of an AES block (16 bytes).  Table t[i], i = 0..31, holds the hash
  key multiplied by every possible value (0x0-0xF) of nibble i, so a full
  multiplication needs no shifting at run time.
  @param hkey the authentication key
  @param t the tables that will be initialized (32 x 16 x 16 bytes)
  @note Returns nothing; table generation has no failure path.
*/
static void GCM_TAB_init_8kb(
			     unsigned char hkey[16],
			     unsigned char t[32][16][16])
{
  int nibble = 0;

  while (nibble < 32) {
    GCM_TAB_gen_4bit_table_n(nibble, hkey, t[nibble]);
    nibble++;
  }
}

/*! @brief initialize the tables for level 4 acceleration.
  This acceleration level keeps one 256 entry table for each of the 16
  bytes of an AES block.  Table t[i], i = 0..15, holds the hash key
  multiplied by every possible value (0x00-0xFF) of byte i, so a full
  multiplication needs no shifting at run time.
  @param hkey the authentication key
  @param t the tables that will be initialized (16 x 256 x 16 bytes)
  @note Returns nothing; table generation has no failure path.
*/
static void GCM_TAB_init_64kb(
			      unsigned char hkey[16],
			      unsigned char t[16][256][16])
{
  int byte_pos = 0;

  while (byte_pos < 16) {
    GCM_TAB_gen_8bit_table_n(byte_pos, hkey, t[byte_pos]);
    byte_pos++;
  }
}

/*! @brief Number of bytes of table storage an acceleration level needs
  @param accel the acceleration level
  @return the table size in bytes; 0 for GCM_ACCEL_noaccel and for any
  value that is not a known level
*/
static size_t GCM_table_size(GCM_ACCEL accel)
{
  switch (accel) {
  case GCM_ACCEL_level1:
    return 16 * 16;           /* 256 bytes */
  case GCM_ACCEL_level2:
    return 256 * 16;          /* 4096 bytes */
  case GCM_ACCEL_level3:
    return 32 * 16 * 16;      /* 8192 bytes */
  case GCM_ACCEL_level4:
    return 16 * 256 * 16;     /* 65536 bytes */
  case GCM_ACCEL_noaccel:
  default:
    return 0;
  }
}
/*! @brief Allocate storage for a GCM acceleration table
  large enough for the requested level of acceleration.  The memory is not
  initialized; the size requested is whatever GCM_table_size reports for
  the level (0 for no acceleration).
  @param accel the acceleration level to use
  @return whatever CRYPTO_malloc returns for that size
*/
static void * GCM_TAB_new(GCM_ACCEL accel)
{
  return CRYPTO_malloc(GCM_table_size(accel), __FILE__, __LINE__);
}

/*! @brief given a pointer to some memory call the appropriate
  table initialization routine.
  @param accel the acceleration level for the table
  @param t the pointer to memory that will hold the table; it must be at
  least GCM_table_size(accel) bytes
  @param hkey the authentication (hash) key the table is derived from
  @retval 0 if table initialization fails: t is NULL although the level
  needs a table (e.g. the allocation failed), or accel is not a known level
  @retval 1 if table initialization succeeds; GCM_ACCEL_noaccel needs no
  table and always succeeds
*/
static int GCM_TAB_init(GCM_ACCEL accel, void *t, unsigned char hkey[16])
{
  int ret = 1;

  /* Nothing to build when running without acceleration */
  if (GCM_ACCEL_noaccel == accel) {
    return 1;
  }
  /* Every remaining level writes through t, so a missing table is a
     failure the caller must see rather than a silent success */
  if (NULL == t) {
    return 0;
  }

  switch (accel) {
  case GCM_ACCEL_level1:
    GCM_TAB_init_256b(hkey, (unsigned char (*)[16]) t); /*[16][16]*/
    break;
  case GCM_ACCEL_level2:
    GCM_TAB_init_4kb(hkey, (unsigned char (*)[16]) t); /*[256][16]*/
    break;
  case GCM_ACCEL_level3:
    GCM_TAB_init_8kb(hkey, (unsigned char (*)[16][16]) t); /*[32][16][16]*/
    break;
  case GCM_ACCEL_level4:
    GCM_TAB_init_64kb(hkey, (unsigned char (*)[256][16]) t); /*[16][256][16]*/
    break;
  default:
    /* Unknown acceleration level */
    ret = 0;
    break;
  }
  return ret;
}

/*! @brief clear/free a GCM acceleration table
  @param accel the acceleration level the table was allocated for; it
  determines how many bytes are wiped before the memory is released
  @param t a pointer to the table to clean/free, may be NULL
*/
static void GCM_TAB_free(GCM_ACCEL accel, void *t) {
  if(NULL != t) {
    /* The table is derived from the hash key.  Use OPENSSL_cleanse rather
       than memset so the wipe cannot be optimized away ahead of the free. */
    OPENSSL_cleanse(t, GCM_table_size(accel));
    CRYPTO_free(t);
  }
}

/** 
    @brief IV generator for AES_GCM IV's
    @param gcm_ctx An AES_GCM context; its IVctx, IVcounter and count fields
    carry the generator state from one call to the next
    @param out an 8 byte buffer which receives reasonably non-repeating data,
    ideally at least 2^32 between repeats, and not predictable.
    @return 1 if O.K. 0 on failure: a NULL argument, the cipher or its
    context could not be obtained, the RNG failed or the encryption failed.
    Nothing useful is in out when 0 is returned.
    @note
    - There are probably a lot of ways to do this, but the simplest to ensure: 
    - a) non-repetition over 2^32 calls
    - b) unpredictability of the data stream
    seems to be to encrypt a counter with (any) cipher with an 8 byte block size and
    a randomly generated key. 
    - To guarantee no repeats within our 8 byte output
    we need a cipher with an 8 byte blocksize so AES isn't usable.
    The 8 byte cipher blocksize is intended to give a unique 1:1 transform
    between our input (counter) and the output and no repeats for the block size.
    - To avoid attacks, we generate a new random key (and a new random
    starting counter) every 2^32 iterations.
    - The choice of cipher itself shouldn't matter as we are only relying on this one 
    property (key specific 1:1 mapping of input to output over 64 bits),
    not its strength, and over one block, all ciphers will be equally strong or weak
    WRT this. We chose to use blowfish because of its speed.
    DES, 3DES or CAST could also be used, but offer no functional advantages.
    The weak keys in DES cause problems: they require reducing the key space.
    - We believe blowfish is usable here, even in FIPS approved code, because we 
    are NOT using it as an encryption function, simply as a way to guarantee 
    the unique 1:1 input/output mapping.
*/
   
int AES_GCM_GenerateIV(AES_GCM_CTX *gcm_ctx,unsigned char out[8]) 
{
  AES_GCM_CTX_t *ctx = (AES_GCM_CTX_t *)gcm_ctx;
  const EVP_CIPHER *cipher = NULL; 
  unsigned char IVkeybuf[16];  /*!< Transient encrypt key */
  int outl = 0;
  int ok = 0;

  if(NULL == ctx || NULL == out) {
    return 0;
  }
  /* Have we used this for IV generation before ? */
  if(NULL == ctx->IVctx) {
    /* Create an encryption context for this IV generator */
    ctx->IVctx = EVP_CIPHER_CTX_new();
    if(NULL == ctx->IVctx) {
      return 0;
    }
  }
  /* Initialize from scratch the first time, 
     and every 2^32'th time round the block.
     count is left at 0 on any failure below so the next call retries.
  */
  if(0 == ctx->count) {
    /* Obtain the block cipher; it is only needed when (re)keying */
    cipher = EVP_get_cipherbyname("blowfish");
    if(NULL == cipher) {
      return 0;
    }
    /* Random starting value for the counter, and a new seed from the
       TRNG for the transform key */
    if(RAND_bytes(ctx->IVcounter,8) <= 0 || RAND_bytes(IVkeybuf,16) <= 0) {
      OPENSSL_cleanse(IVkeybuf,sizeof(IVkeybuf));
      return 0;
    }
    EVP_CIPHER_CTX_cleanup(ctx->IVctx);
    EVP_CIPHER_CTX_set_padding(ctx->IVctx,0);
    ok = EVP_EncryptInit(ctx->IVctx,cipher,IVkeybuf,ctx->IVcounter);
    /* The key now lives in the cipher context; don't leave a copy on the stack */
    OPENSSL_cleanse(IVkeybuf,sizeof(IVkeybuf));
    if(1 != ok) {
      return 0;
    }
  }
  /* Here we go, encrypt our randomly initialized counter with a random key */
  if(1 != EVP_EncryptUpdate(ctx->IVctx,out,&outl,ctx->IVcounter,8) || 8 != outl) {
    return 0;
  }
  /* Increment the counter, 8 bytes of it in this case */
  AES_gcm_inc(ctx->IVcounter,8);
  /* Increment the number of cycles, and ensure we'll restart again at 2^32 */
  ctx->count = (ctx->count +1) & 0xffffffff;  
  return 1;
}
/*
 *
 * This algorithm assumes that the counter is in the x lower bits
 * of the IV (ivec), and that the application has full control over
 * overflow and the rest of the IV.  This implementation takes NO
 * responsibility for checking that the counter doesn't overflow
 * into the rest of the IV when incremented.
 */

/*! @brief allocate a new, zero-filled AES_GCM context
  @return a newly allocated AES_GCM context, or NULL if the allocation failed
*/
AES_GCM_CTX * AES_GCM_CTX_new()
{
  void *mem = CRYPTO_malloc(sizeof(AES_GCM_CTX_t),__FILE__,__LINE__);

  if(NULL == mem) {
    return NULL;
  }
  /* Start with every field zero */
  memset(mem,0,sizeof(AES_GCM_CTX_t));
  return (AES_GCM_CTX *)mem;
}

/*! @brief clear/free a previously allocated AES_GCM context/IV context
  Releases the acceleration table, the IV generator cipher context and the
  AES cipher context if present, wipes the structure and frees it.
  @param ain the AES_GCM context to clear/free; NULL is accepted and ignored
*/
void AES_GCM_CTX_free(AES_GCM_CTX *ain)
{
  AES_GCM_CTX_t *a = (AES_GCM_CTX_t *)ain;

  /* Follow the usual *_free convention: freeing nothing is not an error */
  if(NULL == a) {
    return;
  }
  /* Release any table regardless of the recorded level so it cannot leak;
     GCM_TAB_free wipes as many bytes as that level implies */
  if(NULL != a->tab.tabdata) {
    GCM_TAB_free(a->tab.accel, a->tab.tabdata);
    a->tab.tabdata = NULL;
  }
  /* If the IV part of the context was in use, clean that up */
  if(NULL != a->IVctx) {
    EVP_CIPHER_CTX_cleanup(a->IVctx);
    EVP_CIPHER_CTX_free(a->IVctx);      
  }
  if(NULL != a->ctx) {
    EVP_CIPHER_CTX_cleanup(a->ctx);
    EVP_CIPHER_CTX_free(a->ctx);      
  }
  /* Clear everything.  OPENSSL_cleanse is used instead of memset so the
     wipe of key-derived state cannot be optimized away before the free. */
  OPENSSL_cleanse(a,sizeof(AES_GCM_CTX_t));
  CRYPTO_free(a);
}
/*! @brief Multiply X by Y in GF(2^128) into Z using no optimization
  (one bit of Y per iteration).
  @param Z The output for the multiplication (16 bytes); may be the same
  buffer as X or Y, the result is only stored once the inputs are consumed
  @param X X is multiplied by Y (16 bytes)
  @param Y X is multiplied by Y (16 bytes)
  @return 0 on success (there is no failure path)
*/
static int GCM_mult_noaccel(
			    unsigned char *Z,
			    unsigned char *X,
			    unsigned char *Y)
{
  /* Bit ordering: the spec describes the field polynomial little endian,
     so "bit 0" of a block is the most significant bit (0x80) of byte 0 and
     multiplying by x is a right shift of the byte string. */
  unsigned char acc[AES_BLOCK_SIZE];     /* running sum, copied to Z at the end */
  unsigned char shifted[AES_BLOCK_SIZE]; /* X times x^bit */
  int bit;
  int carry;

  memset(acc,0,AES_BLOCK_SIZE);
  memcpy(shifted,X,AES_BLOCK_SIZE);
  for(bit = 0; bit < 128; bit++) {
    /* Add (XOR) the current multiple of X wherever Y has a 1 bit */
    if(Y[bit >> 3] & (0x80 >> (bit & 7))) {
      xor(acc,acc,shifted,AES_BLOCK_SIZE);
    }
    /* Multiply by x.  If a set bit falls off the end the value overflowed
       the field and has to be reduced by the field polynomial. */
    carry = shifted[15] & 0x01;
    shr(shifted,AES_BLOCK_SIZE);
    if(carry) {
      shifted[0] ^= 0xe1;
    }
  }
  memcpy(Z,acc,AES_BLOCK_SIZE);
  return 0;
}
/*! @brief Multiply X by the value precalculated into the single nibble
  table t (level 1 acceleration) and place the result in Z.
  @param Z The result of the multiplication (16 bytes); may alias X
  @param X The value to be multiplied by the value precalculated in t
  @param t 16 entry table: the hash key times each value of the first nibble
  @return 0 on success (there is no failure path)
*/
static int GCM_mult_level1(
			   unsigned char *Z,
			   unsigned char *X,
			   unsigned char t[16][16])
{
  unsigned char in[16];
  int nib;
  int k;
  int tabi;
  int dropped;

  /* Only the table for the first nibble exists, so the shifting is done
     here: start with the last nibble (highest degree), add its table
     entry, then move the running result right by one nibble before the
     next one is added.  By the time nibble 0 is added, nibble i has been
     shifted i times. */
  memcpy(in, X, 16);
  memset(Z, 0x00, 16);
  for (nib = 31; ; nib--) {
    /* Odd positions are the low nibble of a byte, even ones the high nibble */
    tabi = (nib & 0x01) ? (in[nib >> 1] & 0x0F) : ((in[nib >> 1] >> 4) & 0x0F);
    xor(Z, Z, t[tabi], 16);
    if (nib == 0) {
      break;  /* the first nibble is added unshifted */
    }

    /* Shift Z right four bits, folding the lost nibble back in */
    dropped = Z[15] & 0x0F;
    for (k = 15; k > 0; k--) {
      Z[k] = (unsigned char)((Z[k] >> 4) | (Z[k - 1] << 4));
    }
    Z[0] = (unsigned char)((Z[0] >> 4) ^ rem_table_4bit[dropped][0]);
    Z[1] ^= rem_table_4bit[dropped][1];
  }

  return 0;
}
/*! @brief Multiply X by the value precalculated into the single byte
  table t (level 2 acceleration) and place the result in Z.
  @param Z The result of the multiplication (16 bytes); may alias X
  @param X The value to be multiplied by the precalculated value in t
  @param t 256 entry table: the hash key times each value of the first byte
  @return 0 on success (there is no failure path)
*/
static int GCM_mult_level2(
			   unsigned char *Z,
			   unsigned char *X,
			   unsigned char t[256][16])
{
  unsigned char in[16];
  int pos;
  int dropped;

  /* Only the table for the first byte exists (the hash key times every
     combination of 1, x, ..., x^7), so later bytes have to be multiplied by
     x^8 the right number of times.  Start with the last byte, add its table
     entry and move the running result right by one byte; repeating this
     down to byte 1 shifts each contribution the correct number of times.
     Byte 0 is then added without a shift. */
  memcpy(in, X, 16);
  memset(Z, 0x00, 16);
  for (pos = 15; pos > 0; pos--) {
    xor(Z, Z, t[in[pos]], 16);

    /* Shift Z right one byte, folding the lost byte back in */
    dropped = Z[15];
    memmove(Z + 1, Z, 15);
    Z[0] = rem_table_8bit[dropped][0];
    Z[1] ^= rem_table_8bit[dropped][1];
  }
  xor(Z, Z, t[in[0]], 16);
  return 0;
}
/*! @brief Multiply X by the value precalculated into the per-nibble
  tables t (level 3 acceleration) and place the result in Z.
  @param Z The result of the multiplication (16 bytes); may alias X
  @param X The value to be multiplied by the precalculated value in t
  @param t 32 tables of 16 entries, one table per nibble position of X
  @return 0 on success (there is no failure path)
*/
static int GCM_mult_level3(
			   unsigned char *Z,
			   unsigned char *X,
			   unsigned char t[32][16][16])
{
  unsigned char in[16];
  int pos;

  /* All shifting was done when the tables were built: look up each nibble
     in the table for its position and add the entries together.  The high
     nibble of byte pos uses table 2*pos, the low nibble table 2*pos + 1. */
  memcpy(in, X, 16);
  memset(Z, 0x00, 16);
  for (pos = 0; pos < 16; pos++) {
    xor(Z, Z, t[2 * pos][(in[pos] >> 4) & 0x0F], 16);
    xor(Z, Z, t[2 * pos + 1][in[pos] & 0x0F], 16);
  }
  return 0;
}
/*! @brief Multiply X by the value precalculated into the per-byte
  tables t (level 4 acceleration) and place the result in Z.
  @param Z The result of the multiplication (16 bytes); may alias X
  @param X The value to be multiplied by the precalculated value in t
  @param t 16 tables of 256 entries, one table per byte position of X
  @return 0 on success (there is no failure path)
*/
static int GCM_mult_level4(
			   unsigned char *Z,
			   unsigned char *X,
			   unsigned char t[16][256][16])
{
  unsigned char in[16];
  int pos = 0;

  /* All shifting was done when the tables were built: look up each byte in
     the table for its position and add the entries together. */
  memcpy(in, X, 16);
  memset(Z, 0x00, 16);
  while (pos < 16) {
    xor(Z, Z, t[pos][in[pos]], 16);
    pos++;
  }
  return 0;
}
/*!
  @brief Multiply two bit strings in a Galois field
  Dispatches to the multiplier matching the acceleration level recorded in
  tab.  There are multiple space/time tradeoffs possible, from the large
  precomputed tables down to GCM_ACCEL_noaccel, where the algorithm is
  brute forced one bit at a time.
  @param tab a pointer to the GCM acceleration table descriptor
  @param Z  result vector
  @param X  input vector1
  @param Y  input vector2.  If tab->accel != GCM_ACCEL_noaccel then this
  parameter is ignored and the table (built from the hash key) is used
  instead.
  @return 0 == O.K. 1 = invalid parameter (unknown acceleration level)
  @note The GCM operation is defined on a little endian field but all the
  data is kept in its natural form (big endian), which is what makes the
  bit handling in the multipliers awkward.
*/
static int GCM_mult(GCM_TAB *tab,
		    unsigned char *Z,
		    unsigned char *X,
		    unsigned char *Y)
{
  switch (tab->accel) {
  case GCM_ACCEL_noaccel:
    return GCM_mult_noaccel(Z, X, Y);
  case GCM_ACCEL_level1:
    return GCM_mult_level1(Z, X, (unsigned char (*)[16]) tab->tabdata);
  case GCM_ACCEL_level2:
    return GCM_mult_level2(Z, X, (unsigned char (*)[16]) tab->tabdata);
  case GCM_ACCEL_level3:
    return GCM_mult_level3(Z, X, (unsigned char (*)[16][16]) tab->tabdata);
  case GCM_ACCEL_level4:
    return GCM_mult_level4(Z, X, (unsigned char (*)[256][16]) tab->tabdata);
  default:
    break;
  }
  /* Not a level we know how to multiply with */
  return 1;
}
/*! 
  @brief Public (test) entry point for the Galois multiplier
  Multiply two 16 byte vectors (X & Y) in a 128 bit Galois field
  @param gcm_ctx an ICC AES_GCM context; only its acceleration table
  descriptor is used.  It is dereferenced without a NULL check.
  @param Z the result
  @param X the X input vector
  @param Y the Y input vector.  Ignored when the context uses an
  acceleration level other than GCM_ACCEL_noaccel: the table stored in the
  context is used instead.
  @return the result of GCM_mult: 0 if O.K., 1 if the context's
  acceleration level is not recognised
  @note This routine was provided ONLY to allow us to debug and performance tune
  the AES-GCM code 
  @note if supported, any acceleration set up in the AES_GCM context should 
  work for this routine
  @note the field chosen is 1 + a +a^2 + a^7 + a^128
*/
int GCM128(AES_GCM_CTX *gcm_ctx,unsigned char *Z, unsigned char *X, unsigned char *Y)
{   
  /* Hand the context's table descriptor straight to the internal multiplier */
  return GCM_mult(&(((AES_GCM_CTX_t *)gcm_ctx)->tab),Z,X,Y);
}
/** @brief
    Setup AES-GCM acceleration tables.
    Builds a table for the requested level from the context's current hash
    key (a->H) and, only once that has succeeded, releases the previous
    table and installs the new one.  On failure the context is unchanged.
    @param a an AES_GCM_CTX template
    @param accel the speed/space tradeoff preferred
    @retval 1 the context now uses (or already used) the requested level
    @retval 0 the level is unknown, or the table could not be allocated or
    initialized
*/
static int aes_gcm_accel(AES_GCM_CTX_t *a, int accel)
{
  void *tab = NULL;

  /* if the current acceleration is the same as the
     requested, then simply return success. */
  if (a->tab.accel == accel) {
    return 1;
  }

  /* Running without acceleration needs no table; every other level does */
  if (GCM_ACCEL_noaccel != accel) {
    /* A size of 0 here means accel is not one of the known levels */
    if (0 == GCM_table_size(accel)) {
      return 0;
    }
    tab = GCM_TAB_new(accel);
    if (NULL == tab) {
      /* Allocation failed: never install a NULL table, the multipliers
	 would dereference it */
      return 0;
    }
    if (0 == GCM_TAB_init(accel, tab, a->H)) {
      GCM_TAB_free(accel, tab);
      return 0;
    }
  }

  /* Swap in the new table, wiping and releasing the old one */
  GCM_TAB_free(a->tab.accel, a->tab.tabdata);
  a->tab.accel = accel;
  a->tab.tabdata = tab;
  return 1;
}
/** @brief the ctrl interface for AES GCM.
    @param ain an AES_GCM_CTX context
    @param type the type of ctrl to run:
    AES_GCM_CTRL_SET_ACCEL switches the context to acceleration level val;
    AES_GCM_CTRL_GET_ACCEL stores the current level through ptr
    @param val the value corresponding to type (used by SET_ACCEL)
    @param ptr a pointer to data corresponding to type (for GET_ACCEL, an
    int that receives the level)
    @retval 1 on success
    @retval 0 on failure: NULL context, NULL ptr for GET_ACCEL, an unknown
    type, or the acceleration change failed
*/

int AES_GCM_CTX_ctrl(AES_GCM_CTX *ain, int type, int val, void *ptr)
{
  AES_GCM_CTX_t *a = (AES_GCM_CTX_t *)ain;
  int ret = 0;

  /* Every request needs a context to act on */
  if (NULL == a) {
    return 0;
  }

  switch (type) {
  case AES_GCM_CTRL_SET_ACCEL:
    ret = aes_gcm_accel(a, val);
    break;
  case AES_GCM_CTRL_GET_ACCEL:
    /* Report failure instead of writing through a NULL pointer */
    if (NULL != ptr) {
      *(int *)ptr = a->tab.accel;
      ret = 1;
    }
    break;
  default:
    ret = 0;
    break;
  }

  return ret;
}

/** @brief Calculate the GHASH of an arbitrary data stream 
    @param tab the GCM speedup table. (if one exists)
    @param H the hash key; only used by the multiplier when tab selects
    GCM_ACCEL_noaccel
    @param Y is the input/output hash state, (AES_BLOCK_SIZE long)
    @param X the input data
    @param xlen is the length of the input data in bytes
    @note Returns nothing.
    @note The last block of X will be 0 padded - so no partial blocks
    unless you want this.
    @note Y should be initialized to 0 if this is the first pass
    otherwise, Y is the output from the previous invocation and
    the GHASH can be chained.   
*/

static void GHASH_i(GCM_TAB *tab,unsigned char *H,unsigned char *Y, unsigned char *X,unsigned long xlen) 
{  
  unsigned char tmp[AES_BLOCK_SIZE];
     
  /* Whole blocks: Y = (Y xor X_i) * H */
  for( ; xlen >= AES_BLOCK_SIZE; xlen -= AES_BLOCK_SIZE) {
    printbin("GHASH",X,AES_BLOCK_SIZE);
    xor(Y,X,Y,AES_BLOCK_SIZE);
    GCM_mult(tab,Y,Y,H);
    printN(Y,AES_BLOCK_SIZE);
    X += AES_BLOCK_SIZE;
  }
  /* Last block */
  if( xlen > 0 ) {
    /* Zero pad the trailing partial block up to a full block */
    memset(tmp,0,AES_BLOCK_SIZE);
    memcpy(tmp,X,xlen);
    /* Trace the padded copy: only xlen bytes of X are valid, so dumping a
       full block from X would read past the end of the caller's data */
    printbin("GHASH",tmp,AES_BLOCK_SIZE);
    xor(Y,tmp,Y,AES_BLOCK_SIZE);
    GCM_mult(tab,Y,Y,H);
    /* Trace the updated state after the multiply, as for full blocks */
    printN(Y,AES_BLOCK_SIZE);
  }
}
/** @brief Calculate the GHASH of an arbitrary data stream
    Public wrapper around GHASH_i that uses the acceleration table held in
    the given context.
    @param gcm_ctx an AES GCM context; it is dereferenced without a NULL check
    @param H the hash key
    @param Hash is the input/output, (AES_BLOCK_SIZE long)
    @param X the input data
    @param Xlen is the length of the input data in bytes
    @note Returns nothing.
    @note The last block of X will be 0 padded - so no partial blocks
    unless you want this.
    @note Hash should be initialized to 0 if this is the first pass
    otherwise, Hash is the output from the previous invocation and
    the GHASH can be chained.   
*/
void GHASH(AES_GCM_CTX *gcm_ctx,unsigned char *H, unsigned char *Hash, unsigned char *X,unsigned long Xlen)
{
  /* Pass the context's table descriptor on to the internal implementation */
  GHASH_i(&(((AES_GCM_CTX_t *)gcm_ctx)->tab),H,Hash,X,Xlen);  
}


/** @brief
    Initialize an AES GCM operation, provide the initialization data and
    key
    @param ain an AES_GCM_CTX context
    @param iv The IV, can be 1-2^56 bytes long, 12 bytes is best
    @param ivlen the length of the IV in bytes
    @param key an aes key 16,24 or 32 bytes long, or NULL to reuse the
    key already held by the context
    @param klen the length of the aes key in bytes
    @return 1 if O.K., 0 otherwise (NULL context or IV, bad key length,
    no key available, or a failure reported by the cipher/table setup)
    @note NOTE(review): when the context already holds a cipher context
    the key setup step is skipped, so a key passed in is not applied and
    the existing key, hash key and table are reused.
*/ 
int AES_GCM_Init(AES_GCM_CTX *ain,
		 unsigned char *iv,unsigned long ivlen,
		 unsigned char *key, unsigned int klen

		 )
{
  AES_GCM_CTX_t *a = (AES_GCM_CTX_t *)ain;
  int rv = 1;
  int outl = 0;
  const EVP_CIPHER *cipher = NULL;
#if defined(AES_GCM_DEBUG)
  Xc = 1;
  Yc = 0;
  Nc = 1;
#endif
  /* The context and iv must be present. This has to be checked before
     anything below looks inside the context.
  */
  if(NULL == a || NULL == iv) {
    return 0;
  }
  /* 
     If a key was already setup ...
     It's possible to call Init without specifying a key
     in which case a previous key/GCM tables will be reused
  */
  if(NULL != key) {
    /* check for a valid AES key */
    switch(klen) {
    case 16:
      cipher = EVP_get_cipherbyname("AES-128-ECB");
      break;
    case 24:
      cipher = EVP_get_cipherbyname("AES-192-ECB");
      break;
    case 32:
      cipher = EVP_get_cipherbyname("AES-256-ECB");
      break;
    default:
      rv = 0;
      break;
    }
  }
  /* we need to have either a cipher or a previously constructed ctx */
  if(NULL == cipher && (NULL == a->ctx)) {
    rv = 0;
  }
  if( 1 == rv) {
    /* Clean stuff that needs to be cleared ... */
    memset(a->Y,0,AES_BLOCK_SIZE);
    memset(a->R,0,AES_BLOCK_SIZE);
    /* Drop any residue left by an operation that was never finalized */
    a->Rlen = 0;
    a->state = 0;
    a->LenA = 0;
    a->LenC = 0;
    /* Create a key if we need one */
    if(NULL == a->ctx) {
      void *tab = NULL;
      a->ctx = EVP_CIPHER_CTX_new();
      if(NULL == a->ctx) {
        rv = 0;
      } else if(1 != EVP_EncryptInit(a->ctx,cipher,key,NULL)) {
        rv = 0;
      } else {
        /* Padding is configured after the cipher is set up, so the
           setting applies to the initialized context */
        EVP_CIPHER_CTX_set_padding(a->ctx,0);
        /* generate the hash key, which is 0 encrypted with the AES key */
        if(1 != EVP_EncryptUpdate(a->ctx,a->H,&outl,&zero[0],AES_BLOCK_SIZE)) {
          rv = 0;
        }
      }
      if( 1 == rv ) {
        /* set up acceleration for the key.  The new table is built
           first and only swapped in (freeing the old one) on success.
        */
        tab = GCM_TAB_new(a->tab.accel);
        if (1 == (rv = GCM_TAB_init(a->tab.accel, tab, a->H))) {
          GCM_TAB_free(a->tab.accel, a->tab.tabdata);
          a->tab.tabdata = tab;
        } else {
          GCM_TAB_free(a->tab.accel, tab);
        }
      }
      /* Never leave a half built key behind: a later Init without a
         key would otherwise treat it as a usable, fully set up context
      */
      if( (1 != rv) && (NULL != a->ctx) ) {
        EVP_CIPHER_CTX_free(a->ctx);
        a->ctx = NULL;
      }
    } /* Otherwise we assume this was already set up */
  }

  if( 1 == rv ) {
      
    /* now eat the IV */
    if(ivlen == 12) {
      /* 12 byte IV: counter block is IV || 0x00000001 */
      memcpy(a->ecount_buf,iv,12);
      memset(a->ecount_buf+12,0,4); /* Zero the count buffer */
      a->ecount_buf[15] = 1; /* Counter starts at 1 */  
    } else {
      /* Any other IV length: the counter block is the GHASH of the
         (zero padded) IV followed by one block carrying the IV length
      */
      memset(a->ecount_buf,0,AES_BLOCK_SIZE);
      GHASH_i(&(a->tab),a->H,a->ecount_buf,iv,ivlen);
      /* and add in the BIT length of the IV, written to the second
         half of R (R was zeroed above) */
      ULongToChar(ivlen*8,a->R+8);
      printbin("0^{64}||len(IV)",a->R,AES_BLOCK_SIZE);
      GHASH_i(&(a->tab),a->H,a->ecount_buf,a->R,AES_BLOCK_SIZE);
      /* R is reused as the residual data buffer, so wipe it again */
      memset(a->R,0,AES_BLOCK_SIZE);
    }
    printY(a->ecount_buf,AES_BLOCK_SIZE);
    /* Encrypt the initial counter block; AES_GCM_Final xors this
       (eZero) into the hash to produce the tag */
    if(1 != EVP_EncryptUpdate(a->ctx,a->eZero,&outl,a->ecount_buf,AES_BLOCK_SIZE)) {
      rv = 0;
    }
    printEY(a->eZero,AES_BLOCK_SIZE);
  }

  return rv;
}

/** @brief
    Fold any buffered, not yet hashed aad into the running hash and
    move the context on to the data phase (state 2)
    @param a an AES_GCM_CTX context
    @note Only acts on the retained buffer when the context is in the
    aad phase (state 1); the state is set to 2 in every case.
*/
static void GCM_merge_aad(AES_GCM_CTX_t *a) {
  const int aad_phase = (1 == a->state);

  if( aad_phase && (a->Rlen > 0) ) {
    /* Hash the trailing aad bytes (GHASH_i zero pads them) */
    GHASH_i(&(a->tab),a->H,a->Y,a->R,a->Rlen);
    /* Account for those bytes in the aad bit length */
    a->LenA += a->Rlen * 8;
    printX(a->Y,AES_BLOCK_SIZE);
  }
  if( aad_phase ) {
    /* The retained buffer is handed over empty to the data phase */
    a->Rlen = 0;
    memset(a->R,0,AES_BLOCK_SIZE);
  }
  a->state = 2;
}
/**
   @brief Common code, encrypts/decrypts and hashes the block held in a->R

   @param a an AES_GCM_CTX context
   @param out in/out pointer to the output buffer, advanced past the
   bytes written
   @param outlen if not NULL, incremented by the number of bytes written
   @param encrypt 1 = encrypt, 0 = decrypt
   @note Uses a->R/a->Rlen to hold/describe the amount of data
   waiting to be encrypted/decrypted. Does nothing when a->Rlen is 0.
*/
static void ED_block(AES_GCM_CTX_t *a,unsigned char **out,unsigned long *outlen,int encrypt) 
{
  int cryptlen = 0;
  unsigned char *ciphertext = NULL;

  if(0 == a->Rlen) {
    return; /* nothing pending */
  }

  /* Step the counter (last 4 bytes) and turn it into key stream in a->S */
  AES_gcm_inc(a->ecount_buf+12,4);
  printY(a->ecount_buf,AES_BLOCK_SIZE);
  EVP_EncryptUpdate(a->ctx,a->S,&cryptlen,a->ecount_buf,AES_BLOCK_SIZE);
  printEY(a->S,AES_BLOCK_SIZE);

  /* Zero the unused tail of both buffers so a partial block is hashed
     zero padded, then xor key stream with the pending input */
  memset(a->R + a->Rlen,0,AES_BLOCK_SIZE - a->Rlen);
  memset(a->S + a->Rlen,0,AES_BLOCK_SIZE - a->Rlen);
  xor(a->S,a->S,a->R,a->Rlen);

  /* a->S now holds the result (cipher text or plain text): emit it */
  memcpy(*out,a->S,a->Rlen);
  *out += a->Rlen;
  if(NULL != outlen) {
    *outlen += a->Rlen;
  }
  a->LenC += (a->Rlen * 8);

  if(2 == a->state) {
    a->state = 3;
  }

  /* The hash always covers cipher text: that is our output (a->S) when
     encrypting and our input (a->R) when decrypting */
  ciphertext = encrypt ? a->S : a->R;
  xor(a->Y,ciphertext,a->Y,AES_BLOCK_SIZE);

  /* The residual buffer is now 'empty' */
  a->Rlen = 0;

  /* generate the next hash block */
  GCM_mult(&(a->tab),a->Y,a->Y,a->H); 
  printX(a->Y,AES_BLOCK_SIZE);
}

/** @brief Perform an AES_GCM_CTX "update" operation

@param a the AES_GCM_CTX_t context
@param aad additional authentication data (hashed, not encrypted)
@param aadlen the length of the aad 0-2^56 bytes long
@param data data to encrypt or decrypt
@param datalen the length of the data in bytes
@param out the output buffer, should be at least "data" long
@param outlen a place to store the length of the output data, may be NULL
@param encrypt 1 if encrypting, 0 if decrypting
@return 1 if O.K., 0 otherwise (aad supplied after data processing started)
@note aad may only be specified up to the point where input data is being supplied
we do support streaming aad up until that point.
- i.e. we allow multiple chunks of aad to be specified provided data is NULL until
the last aad chunk
- multiple chunks of data, but aad must be NULL after the first chunk
- There's very little difference between encrypt and decrypt in AES_GCM
so we've rolled the code into one routine with a wrapper to simplify
maintenance.
- Input that does not fill a block is retained in a->R/a->Rlen until
more input arrives or the operation is finalized.
*/
static int AES_GCM_Update(AES_GCM_CTX_t *a,
			  unsigned char *aad,unsigned long aadlen,
			  unsigned char *data,unsigned long datalen,
			  unsigned char *out, unsigned long *outlen,
			  int encrypt
			  )
{
  int rv = 1;
  /* Bytes needed to complete the retained block. All length arithmetic
     stays in unsigned long: an int counter truncates large inputs and
     lets the trailing copy overrun the retained buffer.
  */
  unsigned long need = 0;
  if(NULL !=outlen) {
    *outlen = 0;
  }
  if((aadlen != 0) && (NULL != aad)) {
    if(a->state > 1) { /* We've started processing data blocks ... */
      rv = 0; /* bad, all the aad must be fed in BEFORE we process data */
    } else {
      a->state = 1; /* Flag that we've been supplied at least some aad */ 
      need = (unsigned long)(AES_BLOCK_SIZE - a->Rlen);
      while( aadlen >= need ) { /* At least one full block */
        /* Top up the retain buffer from incoming */
        memcpy(a->R + a->Rlen,aad,need);
        /* We stole this many bytes of aad */
        aad += need;
        aadlen -= need;
        a->Rlen = 0;
        /* Generate the GMAC of the aad */
        GHASH_i(&(a->tab),a->H,a->Y,a->R,AES_BLOCK_SIZE);
        printX(a->Y,AES_BLOCK_SIZE);
        /* accumulate the number of bits */
        a->LenA += AES_BLOCK_SIZE * 8;
        /* The retain buffer is empty again */
        need = AES_BLOCK_SIZE;
      }
      if( aadlen ) { /* Fewer than 'need' bytes left, keep them for later */
        memcpy(a->R + a->Rlen,aad,aadlen);
        a->Rlen += aadlen;
      }	    
    }
  }
  if( (1 == rv) && (datalen != 0) && (NULL != data) ) {
    /* Check to see if we have any partial aad left to process ? */
    GCM_merge_aad(a); 
    a->state = 2;
    need = (unsigned long)(AES_BLOCK_SIZE - a->Rlen);
    while( datalen >= need ) { /* Do we have a complete data block ? */
      /* Top up the retain buffer from incoming */
      memcpy(a->R + a->Rlen,data,need);
      /* we stole this many bytes of input */
      data += need;
      datalen -= need;
      a->Rlen = AES_BLOCK_SIZE;
      ED_block(a,&out,outlen,encrypt);
      /* ED_block consumed the retained block */
      need = (unsigned long)(AES_BLOCK_SIZE - a->Rlen);
    }
    if(datalen) { /* Not a complete block yet ... */
      memcpy(a->R + a->Rlen,data,datalen);
      a->Rlen += datalen;
    }
  } /* NULL != data */

  return rv;
}
/** @brief Perform an AES_GCM_CTX "updateEncrypt" operation

Thin wrapper which calls AES_GCM_Update() with the encrypt flag set.

@param a the (opaque) AES_GCM_CTX context
@param aad additional authentication data (hashed, not encrypted)
@param aadlen the length of the aad 0-2^56 bytes long
@param data data to encrypt
@param datalen the length of the data to encrypt
@param out the output buffer, should be at least "data" long
@param outlen a place to store the length of the output data
@return 1 if O.K., 0 otherwise
@note aad may only be specified up to the point where input data is being supplied
we do support streaming aad up until that point.
- i.e. we allow multiple chunks of aad to be specified provided data is NULL until
the last aad chunk
- multiple chunks of data, but aad must be NULL after the first chunk
*/
int AES_GCM_EncryptUpdate(AES_GCM_CTX *a,
			  unsigned char *aad,unsigned long aadlen,
			  unsigned char *data,unsigned long datalen,
			  unsigned char *out, unsigned long *outlen)
{
  AES_GCM_CTX_t *ctx = (AES_GCM_CTX_t *)a;
  const int encrypting = 1;

  return AES_GCM_Update(ctx,aad,aadlen,data,datalen,out,outlen,encrypting);
}

/** @brief Perform an AES_GCM_CTX "updateDecrypt" operation

Thin wrapper which calls AES_GCM_Update() with the encrypt flag clear.

@param a the (opaque) AES_GCM_CTX context
@param aad additional authentication data (hashed, not encrypted)
@param aadlen the length of the aad 0-2^56 bytes long
@param data data to decrypt
@param datalen the length of the data to decrypt
@param out the output buffer, should be at least "data" long
@param outlen a place to store the length of the output data
@return 1 if O.K., 0 otherwise
@note aad may only be specified up to the point where input data is being supplied
we do support streaming aad up until that point.
- i.e. we allow multiple chunks of aad to be specified provided data is NULL until
the last aad chunk
- multiple chunks of data, but aad must be NULL after the first chunk
*/
int AES_GCM_DecryptUpdate(AES_GCM_CTX *a,
			  unsigned char *aad,unsigned long aadlen,
			  unsigned char *data,unsigned long datalen,
			  unsigned char *out, unsigned long *outlen)
{
  AES_GCM_CTX_t *ctx = (AES_GCM_CTX_t *)a;
  const int encrypting = 0;

  return AES_GCM_Update(ctx,aad,aadlen,data,datalen,out,outlen,encrypting);
}

/** @brief
    The final phase of an AES GCM operation

    @param ain an AES_GCM_CTX pointer
    @param out the buffer to hold any residual encrypted/decrypted data 
    @param outlen a place to hold the length of any residual data,
    may be NULL if the caller does not need it
    @param hash a place to store AES_BLOCK_SIZE bytes of the 
    authentication tag
    @param encrypt 1 if encrypting, 0 if decrypting
    @return 1 if O.K., 0 otherwise (this routine currently always
    returns 1)
*/
static int AES_GCM_Final(AES_GCM_CTX *ain,
			 unsigned char *out, unsigned long *outlen,
			 unsigned char *hash,int encrypt)
{
  AES_GCM_CTX_t *a = (AES_GCM_CTX_t *)ain;
  int rv = 1;
  /* outlen is optional here, as it is in AES_GCM_Update and ED_block */
  if(NULL != outlen) {
    *outlen = 0;
  }
  if( a->Rlen ) { /* Do we have residual data ? */
    if(a->state == 1) {
      /* Only aad was supplied: hash the buffered remainder */
      GCM_merge_aad(a); 
    } else if(a->state >= 2) {
      /* Data phase: process the final, possibly partial, block */
      ED_block(a,&out,outlen,encrypt);
    }
  }
  /* Build the length block: aad bit count then data bit count */
  ULongToChar(a->LenA,a->S);
  ULongToChar(a->LenC,a->S+8);
  printbin("LenALenC",a->S,16);
  /* Hash the length block, then mask with the encrypted initial
     counter block (eZero, computed in AES_GCM_Init) to get the tag */
  GHASH_i(&(a->tab),a->H,a->Y,a->S,AES_BLOCK_SIZE);
  xor(a->Y,a->eZero,a->Y,AES_BLOCK_SIZE);
  memcpy(hash,a->Y,AES_BLOCK_SIZE);
  return rv;
}
/** @brief
    The final phase of an AES GCM encrypt operation

    Thin wrapper which calls AES_GCM_Final() with the encrypt flag set.

    @param ain an AES_GCM_CTX pointer
    @param out the buffer to hold any residual encrypted data 
    @param outlen a place to hold the length of any residual data
    @param hash a place to store AES_BLOCK_SIZE bytes of the 
    authentication tag
    @return 1 if O.K., 0 otherwise
*/
int AES_GCM_EncryptFinal(AES_GCM_CTX *ain,
			 unsigned char *out, unsigned long *outlen,
			 unsigned char *hash)
{
  const int encrypting = 1;
  int status = AES_GCM_Final(ain,out,outlen,hash,encrypting);

  return status;
}

/** @brief
    The final phase of an AES GCM decrypt operation

    @param ain an AES_GCM_CTX pointer
    @param out the buffer to hold any residual decrypted data 
    @param outlen a place to hold the length of any residual data
    @param hash the expected authentication tag, supplied by the caller,
    which is compared against the tag computed here
    @param hlen the number of tag bytes to compare, 1 to AES_BLOCK_SIZE
    @return 1 if the operation completed with the correct hash, 0 otherwise
    (including a NULL hash or an hlen outside 1..AES_BLOCK_SIZE)
*/
int AES_GCM_DecryptFinal(AES_GCM_CTX *ain,
			 unsigned char *out, unsigned long *outlen,
			 unsigned char *hash,unsigned int hlen)
{
  int rv = 0;
  unsigned int i = 0;
  /* Accumulates the differences between the two tags; volatile so the
     comparison loop is not shortened to an early exit */
  volatile unsigned char diff = 0;
  unsigned char ihash[AES_BLOCK_SIZE];
  memset(ihash,0,AES_BLOCK_SIZE);
  rv = AES_GCM_Final(ain,out,outlen,ihash,0);
  printbin("Tag",ihash,AES_BLOCK_SIZE);
  if(1 == rv ) {
    if((NULL == hash) ||
       (hlen == 0) || 
       (hlen > AES_BLOCK_SIZE)) {
      rv = 0;
    } else {
      /* Compare every byte regardless of where the first mismatch is,
         so the time taken does not reveal how much of a forged tag
         was correct */
      for(i = 0; i < hlen; i++) {
        diff |= (unsigned char)(ihash[i] ^ hash[i]);
      }
      if(0 != diff) {
        rv = 0;
      }
    }
  }    
  return rv;
}



