OK, so I had more time to look at the issue mentioned by Uri.

Below and attached is the patch I am proposing that fixes the issue by 
removing some of the undefined behavior that caused the problem.

Comments and objections, please.

**********

It appears there's a bad interaction between 
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS, IsAligned(), IsAlignedOn(), 
GetAlignmentOf(), OptimalDataAlignment() and the ability to use unaligned 
data on i386 and x86_64. Since i386 and x86_64 allow unaligned data, we 
effectively have this goodness:

template <class T>
inline unsigned int GetAlignmentOf(T *dummy=NULL)    // VC60 workaround
{
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    if (sizeof(T) < 16)
        return 1;
#endif

#if (_MSC_VER >= 1300)
    return __alignof(T);
#elif defined(__GNUC__)
    return __alignof__(T);
#elif CRYPTOPP_BOOL_SLOW_WORD64
    return UnsignedMin(4U, sizeof(T));
#else
    return sizeof(T);
#endif
}

However, the logic does not account for the compiler generating SSE 
instructions. And we can only guess when that's going to happen....

So, to fix this issue for now, we have the patch below. First, it makes 
available CRYPTOPP_NO_UNALIGNED_DATA_ACCESS. This unconditionally stops the 
problem. It also helps on other platforms, like early ARM and embedded 
devices.

Second, it changes GetAlignmentOf to the follow. We make strict=true when 
we know vectorization is occurring.

    template <class T, bool STRICT>
        inline unsigned int GetAlignmentOf(T *dummy=NULL)

With GetAlignmentOf fixed, IsAligned and IsAlignedOn falls into place:

    template <class T, bool STRICT>
        inline bool IsAligned(const void *p, T *dummy=NULL) {
            return IsAlignedOn(p, GetAlignmentOf<T, STRICT>());
        }

Third, we just need to fix the problems. We know SHA-3 is a problem due to 
misc.cpp and xorbuf. So we make strict=true in them. Otherwise, we preserve 
existing behavior and set strict=false.

diff --git a/GNUmakefile b/GNUmakefile
index ca36725..045203b 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,6 +1,6 @@
 CXXFLAGS ?= -DNDEBUG
 SYMBOLS ?= -g2
-OPTIMIZE ?= -O2
+OPTIMIZE ?= -O3
 # -fPIC is supported, and enabled by default for x86_64.
 # the following options reduce code size, but breaks link or makes link 
very slow on some systems
 # CXXFLAGS += -ffunction-sections -fdata-sections
@@ -44,6 +44,11 @@ endif
 # Merge symbols and optimizations
 CXXFLAGS += $(SYMBOLS) $(OPTIMIZE)
 
+# GCC 4.8 and above vectorizes some operations, which creates alignment 
problems
+# ifeq ($(filter $(CXXFLAGS),-O3),-O3)
+# CXXFLAGS += -DCRYPTOPP_NO_UNALIGNED_DATA_ACCESS
+# endif
+
 ifeq ($(IS_X86),1)
 
 GCC42_OR_LATER = $(shell $(CXX) -v 2>&1 | $(EGREP) -c "^gcc version 
(4.[2-9]|[5-9])")
diff --git a/config.h b/config.h
index 7472ca0..21f6d4f 100644
--- a/config.h
+++ b/config.h
@@ -342,7 +342,7 @@ NAMESPACE_END
     #define CRYPTOPP_BOOL_X86 0
 #endif
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || defined(__powerpc__)
+#if !defined(CRYPTOPP_NO_UNALIGNED_DATA_ACCESS) && (CRYPTOPP_BOOL_X64 || 
CRYPTOPP_BOOL_X86 || defined(__powerpc__))
     #define CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
 #endif
 
diff --git a/crc.cpp b/crc.cpp
index 10c25c2..3b73c53 100644
--- a/crc.cpp
+++ b/crc.cpp
@@ -126,7 +126,7 @@ void CRC32::Update(const byte *s, size_t n)
 {
     word32 crc = m_crc;
 
-    for(; !IsAligned<word32>(s) && n > 0; n--)
+    for(; !IsAligned<word32, false>(s) && n > 0; n--)
         crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);
 
     while (n >= 4)
diff --git a/cryptlib.cpp b/cryptlib.cpp
index a9ed290..6683129 100644
--- a/cryptlib.cpp
+++ b/cryptlib.cpp
@@ -183,17 +183,17 @@ size_t 
BlockTransformation::AdvancedProcessBlocks(const byte *inBlocks, const by
 
 unsigned int BlockTransformation::OptimalDataAlignment() const
 {
-    return GetAlignmentOf<word32>();
+    return GetAlignmentOf<word32, false>();
 }
 
 unsigned int StreamTransformation::OptimalDataAlignment() const
 {
-    return GetAlignmentOf<word32>();
+    return GetAlignmentOf<word32, false>();
 }
 
 unsigned int HashTransformation::OptimalDataAlignment() const
 {
-    return GetAlignmentOf<word32>();
+    return GetAlignmentOf<word32, false>();
 }
 
 void StreamTransformation::ProcessLastBlock(byte *outString, const byte 
*inString, size_t length)
diff --git a/iterhash.cpp b/iterhash.cpp
index 1e31e9f..e9a812f 100644
--- a/iterhash.cpp
+++ b/iterhash.cpp
@@ -50,7 +50,7 @@ template <class T, class BASE> void IteratedHashBase<T, 
BASE>::Update(const byte
             HashBlock(dataBuf);
             return;
         }
-        else if (IsAligned<T>(input))
+        else if (IsAligned<T, false>(input))
         {
             size_t leftOver = HashMultipleBlocks((T *)input, len);
             input += (len - leftOver);
@@ -138,7 +138,7 @@ template <class T, class BASE> void IteratedHashBase<T, 
BASE>::TruncatedFinal(by
 
     HashBlock(dataBuf);
 
-    if (IsAligned<HashWordType>(digest) && size%sizeof(HashWordType)==0)
+    if (IsAligned<HashWordType, false>(digest) && 
size%sizeof(HashWordType)==0)
         ConditionalByteReverse<HashWordType>(order, (HashWordType 
*)digest, stateBuf, size);
     else
     {
diff --git a/iterhash.h b/iterhash.h
index cce9e82..e83afec 100644
--- a/iterhash.h
+++ b/iterhash.h
@@ -25,7 +25,7 @@ public:
 
     IteratedHashBase() : m_countLo(0), m_countHi(0) {}
     unsigned int OptimalBlockSize() const {return this->BlockSize();}
-    unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T>();}
+    unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T, 
false>();}
     void Update(const byte *input, size_t length);
     byte * CreateUpdateSpace(size_t &size);
     void Restart();
diff --git a/misc.cpp b/misc.cpp
index 3c2c2a5..908578b 100644
--- a/misc.cpp
+++ b/misc.cpp
@@ -14,14 +14,22 @@
 
 NAMESPACE_BEGIN(CryptoPP)
 
+// GCC 4.7, 4.9 and 5.1 generates code using vmovdqa instructions under 
-O3, which cause a segfault due to 128-bit word alignment requirements.
+// Refer to http://stackoverflow.com/q/31373765 and 
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=66852 for more details.
+// #pragma GCC push_options
+// #pragma GCC optimize ("-O2")
+
 void xorbuf(byte *buf, const byte *mask, size_t count)
 {
     size_t i;
 
-    if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+    if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
     {
-        if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) && 
IsAligned<word64>(mask))
+        // GCC 4.8 and above vectorizes the XOR at -O3, so the data must 
be aligned.
+        if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) && 
IsAligned<word64, true>(mask))
         {
+            assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
             for (i=0; i<count/8; i++)
                 ((word64*)buf)[i] ^= ((word64*)mask)[i];
             count -= 8*i;
@@ -44,14 +52,18 @@ void xorbuf(byte *buf, const byte *mask, size_t count)
         buf[i] ^= mask[i];
 }
 
+// #pragma GCC pop_options
+
 void xorbuf(byte *output, const byte *input, const byte *mask, size_t 
count)
 {
     size_t i;
 
-    if (IsAligned<word32>(output) && IsAligned<word32>(input) && 
IsAligned<word32>(mask))
+    if (IsAligned<word32, false>(output) && IsAligned<word32, 
false>(input) && IsAligned<word32, false>(mask))
     {
-        if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(output) && 
IsAligned<word64>(input) && IsAligned<word64>(mask))
+        // GCC 4.8 and above vectorizes the XOR at -O3, so the data must 
be aligned.
+        if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(output) 
&& IsAligned<word64, true>(input) && IsAligned<word64, true>(mask))
         {
+            assert(IsAlignedOn(output, 8)); assert(IsAlignedOn(input, 8)); 
assert(IsAlignedOn(input, 8));
             for (i=0; i<count/8; i++)
                 ((word64*)output)[i] = ((word64*)input)[i] ^ 
((word64*)mask)[i];
             count -= 8*i;
@@ -81,11 +93,14 @@ bool VerifyBufsEqual(const byte *buf, const byte *mask, 
size_t count)
     size_t i;
     byte acc8 = 0;
 
-    if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+    if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
     {
         word32 acc32 = 0;
-        if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) && 
IsAligned<word64>(mask))
+        // GCC 4.8 and above could vectorize the XOR at -O3, so the data 
must be aligned.
+        if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) && 
IsAligned<word64, true>(mask))
         {
+            assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
             word64 acc64 = 0;
             for (i=0; i<count/8; i++)
                 acc64 |= ((word64*)buf)[i] ^ ((word64*)mask)[i];
diff --git a/misc.h b/misc.h
index 9b25ee5..9293490 100644
--- a/misc.h
+++ b/misc.h
@@ -46,7 +46,7 @@
 #define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ >= 7)) || defined(__clang__))
 
 // Used to manage function-level optimizations when working around 
compiler issues.
-//   At -O3, GCC vectorizes and uses SSE instructions, even if alignment 
does not meet instruction requirements.
+//   At -O3, GCC vectorizes uses SSE instructions, even if alignment does 
not meet instruction requirements.
 #define GCC_OPTIMIZE_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ >= 7)) || defined(__clang__))
 
 #if GCC_DIAGNOSTIC_AWARE
@@ -387,11 +387,14 @@ inline T1 RoundUpToMultipleOf(const T1 &n, const T2 
&m)
     return RoundDownToMultipleOf(n+m-1, m);
 }
 
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or 
miss,
+//   but we know it happens with GCC 4.8 and above with -O3 in misc.cpp, 
xorbuf().
+//   We can't provide STRICT=false by default because its a C++11 
extension.
+template <class T, bool STRICT>
 inline unsigned int GetAlignmentOf(T *dummy=NULL)    // VC60 workaround
 {
 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
-    if (sizeof(T) < 16)
+    if (!STRICT && sizeof(T) < 16)
         return 1;
 #endif
 
@@ -411,10 +414,13 @@ inline bool IsAlignedOn(const void *p, unsigned int 
alignment)
     return alignment==1 || (IsPowerOf2(alignment) ? ModPowerOf2((size_t)p, 
alignment) == 0 : (size_t)p % alignment == 0);
 }
 
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or 
miss,
+//   but we know it happens with GCC 4.8 and above with -O3 in misc.cpp, 
xorbuf().
+//   We can't provide STRICT=false by default because its a C++11 
extension.
+template <class T, bool STRICT>
 inline bool IsAligned(const void *p, T *dummy=NULL)    // VC60 workaround
 {
-    return IsAlignedOn(p, GetAlignmentOf<T>());
+    return IsAlignedOn(p, GetAlignmentOf<T, STRICT>());
 }
 
 #ifdef IS_LITTLE_ENDIAN
@@ -562,11 +568,11 @@ template<> inline void SecureWipeBuffer(word64 *buf, 
size_t n)
 template <class T>
 inline void SecureWipeArray(T *buf, size_t n)
 {
-    if (sizeof(T) % 8 == 0 && GetAlignmentOf<T>() % 
GetAlignmentOf<word64>() == 0)
+    if (sizeof(T) % 8 == 0 && GetAlignmentOf<T, false>() % 
GetAlignmentOf<word64, false>() == 0)
         SecureWipeBuffer((word64 *)buf, n * (sizeof(T)/8));
-    else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T>() % 
GetAlignmentOf<word32>() == 0)
+    else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T, false>() % 
GetAlignmentOf<word32, false>() == 0)
         SecureWipeBuffer((word32 *)buf, n * (sizeof(T)/4));
-    else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T>() % 
GetAlignmentOf<word16>() == 0)
+    else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T, false>() % 
GetAlignmentOf<word16, false>() == 0)
         SecureWipeBuffer((word16 *)buf, n * (sizeof(T)/2));
     else
         SecureWipeBuffer((byte *)buf, n * sizeof(T));
diff --git a/panama.cpp b/panama.cpp
index 40c619f..14cc4bf 100644
--- a/panama.cpp
+++ b/panama.cpp
@@ -451,7 +451,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte 
*keystreamBuffer, const byt
     assert(length==32);
     this->Reset();
     this->Iterate(1, m_key);
-    if (iv && IsAligned<word32>(iv))
+    if (iv && IsAligned<word32, false>(iv))
         this->Iterate(1, (const word32 *)iv);
     else
     {
diff --git a/rc2.h b/rc2.h
index ebff798..dd78ef3 100644
--- a/rc2.h
+++ b/rc2.h
@@ -25,7 +25,7 @@ class RC2 : public RC2_Info, public 
BlockCipherDocumentation
     {
     public:
         void UncheckedSetKey(const byte *userKey, unsigned int length, 
const NameValuePairs &params);
-        unsigned int OptimalDataAlignment() const {return 
GetAlignmentOf<word16>();}
+        unsigned int OptimalDataAlignment() const {return 
GetAlignmentOf<word16, false>();}
 
     protected:
         FixedSizeSecBlock<word16, 64> K;  // expanded key table
diff --git a/salsa.cpp b/salsa.cpp
index aefe74c..b6dab21 100644
--- a/salsa.cpp
+++ b/salsa.cpp
@@ -60,7 +60,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
         return 16;
     else
 #endif
-        return GetAlignmentOf<word32>();
+        return GetAlignmentOf<word32, false>();
 }
 
 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
diff --git a/sha3.h b/sha3.h
index 232bae5..b1dc2e2 100644
--- a/sha3.h
+++ b/sha3.h
@@ -15,7 +15,7 @@ public:
     SHA3(unsigned int digestSize) : m_digestSize(digestSize) {Restart();}
     unsigned int DigestSize() const {return m_digestSize;}
     std::string AlgorithmName() const {return "SHA-3-" + 
IntToString(m_digestSize*8);}
-    unsigned int OptimalDataAlignment() const {return 
GetAlignmentOf<word64>();}
+    unsigned int OptimalDataAlignment() const {return 
GetAlignmentOf<word64, true>();}
 
     void Update(const byte *input, size_t length);
     void Restart();
diff --git a/skipjack.h b/skipjack.h
index 6b12647..a533b79 100644
--- a/skipjack.h
+++ b/skipjack.h
@@ -22,7 +22,7 @@ class SKIPJACK : public SKIPJACK_Info, public 
BlockCipherDocumentation
     {
     public:
         void UncheckedSetKey(const byte *userKey, unsigned int length, 
const NameValuePairs &params);
-        unsigned int OptimalDataAlignment() const {return 
GetAlignmentOf<word16>();}
+        unsigned int OptimalDataAlignment() const {return 
GetAlignmentOf<word16, false>();}
 
     protected:
         static const byte fTable[256];
diff --git a/sosemanuk.cpp b/sosemanuk.cpp
index 0cc798f..6f13d0b 100644
--- a/sosemanuk.cpp
+++ b/sosemanuk.cpp
@@ -293,7 +293,7 @@ unsigned int SosemanukPolicy::GetAlignment() const
         return 16;
     else
 #endif
-        return GetAlignmentOf<word32>();
+        return GetAlignmentOf<word32, false>();
 }
 
 unsigned int SosemanukPolicy::GetOptimalBlockSize() const


-- 
-- 
You received this message because you are subscribed to the "Crypto++ Users" 
Google Group.
To unsubscribe, send an email to [email protected].
More information about Crypto++ and this group is available at 
http://www.cryptopp.com.
--- 
You received this message because you are subscribed to the Google Groups 
"Crypto++ Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.
diff --git a/GNUmakefile b/GNUmakefile
index ca36725..045203b 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,6 +1,6 @@
 CXXFLAGS ?= -DNDEBUG
 SYMBOLS ?= -g2
-OPTIMIZE ?= -O2
+OPTIMIZE ?= -O3
 # -fPIC is supported, and enabled by default for x86_64.
 # the following options reduce code size, but breaks link or makes link very slow on some systems
 # CXXFLAGS += -ffunction-sections -fdata-sections
@@ -44,6 +44,11 @@ endif
 # Merge symbols and optimizations
 CXXFLAGS += $(SYMBOLS) $(OPTIMIZE)
 
+# GCC 4.8 and above vectorizes some operations, which creates alignment problems
+# ifeq ($(filter $(CXXFLAGS),-O3),-O3)
+# CXXFLAGS += -DCRYPTOPP_NO_UNALIGNED_DATA_ACCESS
+# endif
+
 ifeq ($(IS_X86),1)
 
 GCC42_OR_LATER = $(shell $(CXX) -v 2>&1 | $(EGREP) -c "^gcc version (4.[2-9]|[5-9])")
diff --git a/config.h b/config.h
index 7472ca0..21f6d4f 100644
--- a/config.h
+++ b/config.h
@@ -342,7 +342,7 @@ NAMESPACE_END
 	#define CRYPTOPP_BOOL_X86 0
 #endif
 
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || defined(__powerpc__)
+#if !defined(CRYPTOPP_NO_UNALIGNED_DATA_ACCESS) && (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || defined(__powerpc__))
 	#define CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
 #endif
 
diff --git a/crc.cpp b/crc.cpp
index 10c25c2..3b73c53 100644
--- a/crc.cpp
+++ b/crc.cpp
@@ -126,7 +126,7 @@ void CRC32::Update(const byte *s, size_t n)
 {
 	word32 crc = m_crc;
 
-	for(; !IsAligned<word32>(s) && n > 0; n--)
+	for(; !IsAligned<word32, false>(s) && n > 0; n--)
 		crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);
 
 	while (n >= 4)
diff --git a/cryptlib.cpp b/cryptlib.cpp
index a9ed290..6683129 100644
--- a/cryptlib.cpp
+++ b/cryptlib.cpp
@@ -183,17 +183,17 @@ size_t BlockTransformation::AdvancedProcessBlocks(const byte *inBlocks, const by
 
 unsigned int BlockTransformation::OptimalDataAlignment() const
 {
-	return GetAlignmentOf<word32>();
+	return GetAlignmentOf<word32, false>();
 }
 
 unsigned int StreamTransformation::OptimalDataAlignment() const
 {
-	return GetAlignmentOf<word32>();
+	return GetAlignmentOf<word32, false>();
 }
 
 unsigned int HashTransformation::OptimalDataAlignment() const
 {
-	return GetAlignmentOf<word32>();
+	return GetAlignmentOf<word32, false>();
 }
 
 void StreamTransformation::ProcessLastBlock(byte *outString, const byte *inString, size_t length)
diff --git a/iterhash.cpp b/iterhash.cpp
index 1e31e9f..e9a812f 100644
--- a/iterhash.cpp
+++ b/iterhash.cpp
@@ -50,7 +50,7 @@ template <class T, class BASE> void IteratedHashBase<T, BASE>::Update(const byte
 			HashBlock(dataBuf);
 			return;
 		}
-		else if (IsAligned<T>(input))
+		else if (IsAligned<T, false>(input))
 		{
 			size_t leftOver = HashMultipleBlocks((T *)input, len);
 			input += (len - leftOver);
@@ -138,7 +138,7 @@ template <class T, class BASE> void IteratedHashBase<T, BASE>::TruncatedFinal(by
 
 	HashBlock(dataBuf);
 
-	if (IsAligned<HashWordType>(digest) && size%sizeof(HashWordType)==0)
+	if (IsAligned<HashWordType, false>(digest) && size%sizeof(HashWordType)==0)
 		ConditionalByteReverse<HashWordType>(order, (HashWordType *)digest, stateBuf, size);
 	else
 	{
diff --git a/iterhash.h b/iterhash.h
index cce9e82..e83afec 100644
--- a/iterhash.h
+++ b/iterhash.h
@@ -25,7 +25,7 @@ public:
 
 	IteratedHashBase() : m_countLo(0), m_countHi(0) {}
 	unsigned int OptimalBlockSize() const {return this->BlockSize();}
-	unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T>();}
+	unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T, false>();}
 	void Update(const byte *input, size_t length);
 	byte * CreateUpdateSpace(size_t &size);
 	void Restart();
diff --git a/misc.cpp b/misc.cpp
index 3c2c2a5..908578b 100644
--- a/misc.cpp
+++ b/misc.cpp
@@ -14,14 +14,22 @@
 
 NAMESPACE_BEGIN(CryptoPP)
 
+// GCC 4.7, 4.9 and 5.1 generates code using vmovdqa instructions under -O3, which cause a segfault due to 128-bit word alignment requirements.
+// Refer to http://stackoverflow.com/q/31373765 and http://gcc.gnu.org/bugzilla/show_bug.cgi?id=66852 for more details.
+// #pragma GCC push_options
+// #pragma GCC optimize ("-O2")
+
 void xorbuf(byte *buf, const byte *mask, size_t count)
 {
 	size_t i;
 
-	if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+	if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
 	{
-		if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) && IsAligned<word64>(mask))
+		// GCC 4.8 and above vectorizes the XOR at -O3, so the data must be aligned.
+		if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) && IsAligned<word64, true>(mask))
 		{
+			assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
 			for (i=0; i<count/8; i++)
 				((word64*)buf)[i] ^= ((word64*)mask)[i];
 			count -= 8*i;
@@ -44,14 +52,18 @@ void xorbuf(byte *buf, const byte *mask, size_t count)
 		buf[i] ^= mask[i];
 }
 
+// #pragma GCC pop_options
+
 void xorbuf(byte *output, const byte *input, const byte *mask, size_t count)
 {
 	size_t i;
 
-	if (IsAligned<word32>(output) && IsAligned<word32>(input) && IsAligned<word32>(mask))
+	if (IsAligned<word32, false>(output) && IsAligned<word32, false>(input) && IsAligned<word32, false>(mask))
 	{
-		if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(output) && IsAligned<word64>(input) && IsAligned<word64>(mask))
+		// GCC 4.8 and above vectorizes the XOR at -O3, so the data must be aligned.
+		if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(output) && IsAligned<word64, true>(input) && IsAligned<word64, true>(mask))
 		{
+			assert(IsAlignedOn(output, 8)); assert(IsAlignedOn(input, 8)); assert(IsAlignedOn(input, 8));
 			for (i=0; i<count/8; i++)
 				((word64*)output)[i] = ((word64*)input)[i] ^ ((word64*)mask)[i];
 			count -= 8*i;
@@ -81,11 +93,14 @@ bool VerifyBufsEqual(const byte *buf, const byte *mask, size_t count)
 	size_t i;
 	byte acc8 = 0;
 
-	if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+	if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
 	{
 		word32 acc32 = 0;
-		if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) && IsAligned<word64>(mask))
+		// GCC 4.8 and above could vectorize the XOR at -O3, so the data must be aligned.
+		if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) && IsAligned<word64, true>(mask))
 		{
+			assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
 			word64 acc64 = 0;
 			for (i=0; i<count/8; i++)
 				acc64 |= ((word64*)buf)[i] ^ ((word64*)mask)[i];
diff --git a/misc.h b/misc.h
index 9b25ee5..9293490 100644
--- a/misc.h
+++ b/misc.h
@@ -46,7 +46,7 @@
 #define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) || defined(__clang__))
 
 // Used to manage function-level optimizations when working around compiler issues.
-//   At -O3, GCC vectorizes and uses SSE instructions, even if alignment does not meet instruction requirements.
+//   At -O3, GCC vectorizes uses SSE instructions, even if alignment does not meet instruction requirements.
 #define GCC_OPTIMIZE_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) || defined(__clang__))
 
 #if GCC_DIAGNOSTIC_AWARE
@@ -387,11 +387,14 @@ inline T1 RoundUpToMultipleOf(const T1 &n, const T2 &m)
 	return RoundDownToMultipleOf(n+m-1, m);
 }
 
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or miss,
+//   but we know it happens with GCC 4.8 and above with -O3 in misc.cpp, xorbuf().
+//   We can't provide STRICT=false by default because its a C++11 extension.
+template <class T, bool STRICT>
 inline unsigned int GetAlignmentOf(T *dummy=NULL)	// VC60 workaround
 {
 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
-	if (sizeof(T) < 16)
+	if (!STRICT && sizeof(T) < 16)
 		return 1;
 #endif
 
@@ -411,10 +414,13 @@ inline bool IsAlignedOn(const void *p, unsigned int alignment)
 	return alignment==1 || (IsPowerOf2(alignment) ? ModPowerOf2((size_t)p, alignment) == 0 : (size_t)p % alignment == 0);
 }
 
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or miss,
+//   but we know it happens with GCC 4.8 and above with -O3 in misc.cpp, xorbuf().
+//   We can't provide STRICT=false by default because its a C++11 extension.
+template <class T, bool STRICT>
 inline bool IsAligned(const void *p, T *dummy=NULL)	// VC60 workaround
 {
-	return IsAlignedOn(p, GetAlignmentOf<T>());
+	return IsAlignedOn(p, GetAlignmentOf<T, STRICT>());
 }
 
 #ifdef IS_LITTLE_ENDIAN
@@ -562,11 +568,11 @@ template<> inline void SecureWipeBuffer(word64 *buf, size_t n)
 template <class T>
 inline void SecureWipeArray(T *buf, size_t n)
 {
-	if (sizeof(T) % 8 == 0 && GetAlignmentOf<T>() % GetAlignmentOf<word64>() == 0)
+	if (sizeof(T) % 8 == 0 && GetAlignmentOf<T, false>() % GetAlignmentOf<word64, false>() == 0)
 		SecureWipeBuffer((word64 *)buf, n * (sizeof(T)/8));
-	else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T>() % GetAlignmentOf<word32>() == 0)
+	else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T, false>() % GetAlignmentOf<word32, false>() == 0)
 		SecureWipeBuffer((word32 *)buf, n * (sizeof(T)/4));
-	else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T>() % GetAlignmentOf<word16>() == 0)
+	else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T, false>() % GetAlignmentOf<word16, false>() == 0)
 		SecureWipeBuffer((word16 *)buf, n * (sizeof(T)/2));
 	else
 		SecureWipeBuffer((byte *)buf, n * sizeof(T));
diff --git a/panama.cpp b/panama.cpp
index 40c619f..14cc4bf 100644
--- a/panama.cpp
+++ b/panama.cpp
@@ -451,7 +451,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt
 	assert(length==32);
 	this->Reset();
 	this->Iterate(1, m_key);
-	if (iv && IsAligned<word32>(iv))
+	if (iv && IsAligned<word32, false>(iv))
 		this->Iterate(1, (const word32 *)iv);
 	else
 	{
diff --git a/rc2.h b/rc2.h
index ebff798..dd78ef3 100644
--- a/rc2.h
+++ b/rc2.h
@@ -25,7 +25,7 @@ class RC2 : public RC2_Info, public BlockCipherDocumentation
 	{
 	public:
 		void UncheckedSetKey(const byte *userKey, unsigned int length, const NameValuePairs &params);
-		unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16>();}
+		unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16, false>();}
 
 	protected:
 		FixedSizeSecBlock<word16, 64> K;  // expanded key table
diff --git a/salsa.cpp b/salsa.cpp
index aefe74c..b6dab21 100644
--- a/salsa.cpp
+++ b/salsa.cpp
@@ -60,7 +60,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
 		return 16;
 	else
 #endif
-		return GetAlignmentOf<word32>();
+		return GetAlignmentOf<word32, false>();
 }
 
 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
diff --git a/sha3.h b/sha3.h
index 232bae5..b1dc2e2 100644
--- a/sha3.h
+++ b/sha3.h
@@ -15,7 +15,7 @@ public:
 	SHA3(unsigned int digestSize) : m_digestSize(digestSize) {Restart();}
 	unsigned int DigestSize() const {return m_digestSize;}
 	std::string AlgorithmName() const {return "SHA-3-" + IntToString(m_digestSize*8);}
-	unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word64>();}
+	unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word64, true>();}
 
 	void Update(const byte *input, size_t length);
 	void Restart();
diff --git a/skipjack.h b/skipjack.h
index 6b12647..a533b79 100644
--- a/skipjack.h
+++ b/skipjack.h
@@ -22,7 +22,7 @@ class SKIPJACK : public SKIPJACK_Info, public BlockCipherDocumentation
 	{
 	public:
 		void UncheckedSetKey(const byte *userKey, unsigned int length, const NameValuePairs &params);
-		unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16>();}
+		unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16, false>();}
 
 	protected:
 		static const byte fTable[256];
diff --git a/sosemanuk.cpp b/sosemanuk.cpp
index 0cc798f..6f13d0b 100644
--- a/sosemanuk.cpp
+++ b/sosemanuk.cpp
@@ -293,7 +293,7 @@ unsigned int SosemanukPolicy::GetAlignment() const
 		return 16;
 	else
 #endif
-		return GetAlignmentOf<word32>();
+		return GetAlignmentOf<word32, false>();
 }
 
 unsigned int SosemanukPolicy::GetOptimalBlockSize() const

Reply via email to