OK, so I had more time to look at the issue mentioned by Uri.
Below and attached is the patch I am proposing that fixes the issue by
removing some of the undefined behavior that caused the problem.
Comments and objections, please.
**********
It appears there's a bad interaction between
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS, IsAligned(), IsAlignedOn(),
GetAlignmentOf(), OptimalDataAlignment() and the ability to use unaligned
data on i386 and x86_64. Since i386 and x86_64 allow unaligned data, we
effectively have this goodness:
template <class T>
inline unsigned int GetAlignmentOf(T *dummy=NULL) // VC60 workaround
{
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
if (sizeof(T) < 16)
return 1;
#endif
#if (_MSC_VER >= 1300)
return __alignof(T);
#elif defined(__GNUC__)
return __alignof__(T);
#elif CRYPTOPP_BOOL_SLOW_WORD64
return UnsignedMin(4U, sizeof(T));
#else
return sizeof(T);
#endif
}
However, the logic does not account for the compiler generating SSE
instructions. And we can only guess when that's going to happen....
So, to fix this issue for now, we have the patch below. First, it makes
available CRYPTOPP_NO_UNALIGNED_DATA_ACCESS. This unconditionally stops the
problem. It also helps on other platforms, like early ARM and embedded
devices.
Second, it changes GetAlignmentOf to the follow. We make strict=true when
we know vectorization is occurring.
template <class T, bool STRICT>
inline unsigned int GetAlignmentOf(T *dummy=NULL)
With GetAlignmentOf fixed, IsAligned and IsAlignedOn falls into place:
template <class T, bool STRICT>
inline bool IsAligned(const void *p, T *dummy=NULL) {
return IsAlignedOn(p, GetAlignmentOf<T, STRICT>());
}
Third, we just need to fix the problems. We know SHA-3 is a problem due to
misc.cpp and xorbuf. So we make strict=true in them. Otherwise, we preserve
existing behavior and set strict=false.
diff --git a/GNUmakefile b/GNUmakefile
index ca36725..045203b 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,6 +1,6 @@
CXXFLAGS ?= -DNDEBUG
SYMBOLS ?= -g2
-OPTIMIZE ?= -O2
+OPTIMIZE ?= -O3
# -fPIC is supported, and enabled by default for x86_64.
# the following options reduce code size, but breaks link or makes link
very slow on some systems
# CXXFLAGS += -ffunction-sections -fdata-sections
@@ -44,6 +44,11 @@ endif
# Merge symbols and optimizations
CXXFLAGS += $(SYMBOLS) $(OPTIMIZE)
+# GCC 4.8 and above vectorizes some operations, which creates alignment
problems
+# ifeq ($(filter $(CXXFLAGS),-O3),-O3)
+# CXXFLAGS += -DCRYPTOPP_NO_UNALIGNED_DATA_ACCESS
+# endif
+
ifeq ($(IS_X86),1)
GCC42_OR_LATER = $(shell $(CXX) -v 2>&1 | $(EGREP) -c "^gcc version
(4.[2-9]|[5-9])")
diff --git a/config.h b/config.h
index 7472ca0..21f6d4f 100644
--- a/config.h
+++ b/config.h
@@ -342,7 +342,7 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_X86 0
#endif
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || defined(__powerpc__)
+#if !defined(CRYPTOPP_NO_UNALIGNED_DATA_ACCESS) && (CRYPTOPP_BOOL_X64 ||
CRYPTOPP_BOOL_X86 || defined(__powerpc__))
#define CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#endif
diff --git a/crc.cpp b/crc.cpp
index 10c25c2..3b73c53 100644
--- a/crc.cpp
+++ b/crc.cpp
@@ -126,7 +126,7 @@ void CRC32::Update(const byte *s, size_t n)
{
word32 crc = m_crc;
- for(; !IsAligned<word32>(s) && n > 0; n--)
+ for(; !IsAligned<word32, false>(s) && n > 0; n--)
crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);
while (n >= 4)
diff --git a/cryptlib.cpp b/cryptlib.cpp
index a9ed290..6683129 100644
--- a/cryptlib.cpp
+++ b/cryptlib.cpp
@@ -183,17 +183,17 @@ size_t
BlockTransformation::AdvancedProcessBlocks(const byte *inBlocks, const by
unsigned int BlockTransformation::OptimalDataAlignment() const
{
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int StreamTransformation::OptimalDataAlignment() const
{
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int HashTransformation::OptimalDataAlignment() const
{
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
void StreamTransformation::ProcessLastBlock(byte *outString, const byte
*inString, size_t length)
diff --git a/iterhash.cpp b/iterhash.cpp
index 1e31e9f..e9a812f 100644
--- a/iterhash.cpp
+++ b/iterhash.cpp
@@ -50,7 +50,7 @@ template <class T, class BASE> void IteratedHashBase<T,
BASE>::Update(const byte
HashBlock(dataBuf);
return;
}
- else if (IsAligned<T>(input))
+ else if (IsAligned<T, false>(input))
{
size_t leftOver = HashMultipleBlocks((T *)input, len);
input += (len - leftOver);
@@ -138,7 +138,7 @@ template <class T, class BASE> void IteratedHashBase<T,
BASE>::TruncatedFinal(by
HashBlock(dataBuf);
- if (IsAligned<HashWordType>(digest) && size%sizeof(HashWordType)==0)
+ if (IsAligned<HashWordType, false>(digest) &&
size%sizeof(HashWordType)==0)
ConditionalByteReverse<HashWordType>(order, (HashWordType
*)digest, stateBuf, size);
else
{
diff --git a/iterhash.h b/iterhash.h
index cce9e82..e83afec 100644
--- a/iterhash.h
+++ b/iterhash.h
@@ -25,7 +25,7 @@ public:
IteratedHashBase() : m_countLo(0), m_countHi(0) {}
unsigned int OptimalBlockSize() const {return this->BlockSize();}
- unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T>();}
+ unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T,
false>();}
void Update(const byte *input, size_t length);
byte * CreateUpdateSpace(size_t &size);
void Restart();
diff --git a/misc.cpp b/misc.cpp
index 3c2c2a5..908578b 100644
--- a/misc.cpp
+++ b/misc.cpp
@@ -14,14 +14,22 @@
NAMESPACE_BEGIN(CryptoPP)
+// GCC 4.7, 4.9 and 5.1 generates code using vmovdqa instructions under
-O3, which cause a segfault due to 128-bit word alignment requirements.
+// Refer to http://stackoverflow.com/q/31373765 and
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=66852 for more details.
+// #pragma GCC push_options
+// #pragma GCC optimize ("-O2")
+
void xorbuf(byte *buf, const byte *mask, size_t count)
{
size_t i;
- if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+ if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
{
- if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) &&
IsAligned<word64>(mask))
+ // GCC 4.8 and above vectorizes the XOR at -O3, so the data must
be aligned.
+ if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) &&
IsAligned<word64, true>(mask))
{
+ assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
for (i=0; i<count/8; i++)
((word64*)buf)[i] ^= ((word64*)mask)[i];
count -= 8*i;
@@ -44,14 +52,18 @@ void xorbuf(byte *buf, const byte *mask, size_t count)
buf[i] ^= mask[i];
}
+// #pragma GCC pop_options
+
void xorbuf(byte *output, const byte *input, const byte *mask, size_t
count)
{
size_t i;
- if (IsAligned<word32>(output) && IsAligned<word32>(input) &&
IsAligned<word32>(mask))
+ if (IsAligned<word32, false>(output) && IsAligned<word32,
false>(input) && IsAligned<word32, false>(mask))
{
- if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(output) &&
IsAligned<word64>(input) && IsAligned<word64>(mask))
+ // GCC 4.8 and above vectorizes the XOR at -O3, so the data must
be aligned.
+ if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(output)
&& IsAligned<word64, true>(input) && IsAligned<word64, true>(mask))
{
+ assert(IsAlignedOn(output, 8)); assert(IsAlignedOn(input, 8));
assert(IsAlignedOn(input, 8));
for (i=0; i<count/8; i++)
((word64*)output)[i] = ((word64*)input)[i] ^
((word64*)mask)[i];
count -= 8*i;
@@ -81,11 +93,14 @@ bool VerifyBufsEqual(const byte *buf, const byte *mask,
size_t count)
size_t i;
byte acc8 = 0;
- if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+ if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
{
word32 acc32 = 0;
- if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) &&
IsAligned<word64>(mask))
+ // GCC 4.8 and above could vectorize the XOR at -O3, so the data
must be aligned.
+ if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) &&
IsAligned<word64, true>(mask))
{
+ assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
word64 acc64 = 0;
for (i=0; i<count/8; i++)
acc64 |= ((word64*)buf)[i] ^ ((word64*)mask)[i];
diff --git a/misc.h b/misc.h
index 9b25ee5..9293490 100644
--- a/misc.h
+++ b/misc.h
@@ -46,7 +46,7 @@
#define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 &&
__GNUC_MINOR__ >= 7)) || defined(__clang__))
// Used to manage function-level optimizations when working around
compiler issues.
-// At -O3, GCC vectorizes and uses SSE instructions, even if alignment
does not meet instruction requirements.
+// At -O3, GCC vectorizes uses SSE instructions, even if alignment does
not meet instruction requirements.
#define GCC_OPTIMIZE_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 &&
__GNUC_MINOR__ >= 7)) || defined(__clang__))
#if GCC_DIAGNOSTIC_AWARE
@@ -387,11 +387,14 @@ inline T1 RoundUpToMultipleOf(const T1 &n, const T2
&m)
return RoundDownToMultipleOf(n+m-1, m);
}
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or
miss,
+// but we know it happens with GCC 4.8 and above with -O3 in misc.cpp,
xorbuf().
+// We can't provide STRICT=false by default because its a C++11
extension.
+template <class T, bool STRICT>
inline unsigned int GetAlignmentOf(T *dummy=NULL) // VC60 workaround
{
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
- if (sizeof(T) < 16)
+ if (!STRICT && sizeof(T) < 16)
return 1;
#endif
@@ -411,10 +414,13 @@ inline bool IsAlignedOn(const void *p, unsigned int
alignment)
return alignment==1 || (IsPowerOf2(alignment) ? ModPowerOf2((size_t)p,
alignment) == 0 : (size_t)p % alignment == 0);
}
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or
miss,
+// but we know it happens with GCC 4.8 and above with -O3 in misc.cpp,
xorbuf().
+// We can't provide STRICT=false by default because its a C++11
extension.
+template <class T, bool STRICT>
inline bool IsAligned(const void *p, T *dummy=NULL) // VC60 workaround
{
- return IsAlignedOn(p, GetAlignmentOf<T>());
+ return IsAlignedOn(p, GetAlignmentOf<T, STRICT>());
}
#ifdef IS_LITTLE_ENDIAN
@@ -562,11 +568,11 @@ template<> inline void SecureWipeBuffer(word64 *buf,
size_t n)
template <class T>
inline void SecureWipeArray(T *buf, size_t n)
{
- if (sizeof(T) % 8 == 0 && GetAlignmentOf<T>() %
GetAlignmentOf<word64>() == 0)
+ if (sizeof(T) % 8 == 0 && GetAlignmentOf<T, false>() %
GetAlignmentOf<word64, false>() == 0)
SecureWipeBuffer((word64 *)buf, n * (sizeof(T)/8));
- else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T>() %
GetAlignmentOf<word32>() == 0)
+ else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T, false>() %
GetAlignmentOf<word32, false>() == 0)
SecureWipeBuffer((word32 *)buf, n * (sizeof(T)/4));
- else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T>() %
GetAlignmentOf<word16>() == 0)
+ else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T, false>() %
GetAlignmentOf<word16, false>() == 0)
SecureWipeBuffer((word16 *)buf, n * (sizeof(T)/2));
else
SecureWipeBuffer((byte *)buf, n * sizeof(T));
diff --git a/panama.cpp b/panama.cpp
index 40c619f..14cc4bf 100644
--- a/panama.cpp
+++ b/panama.cpp
@@ -451,7 +451,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte
*keystreamBuffer, const byt
assert(length==32);
this->Reset();
this->Iterate(1, m_key);
- if (iv && IsAligned<word32>(iv))
+ if (iv && IsAligned<word32, false>(iv))
this->Iterate(1, (const word32 *)iv);
else
{
diff --git a/rc2.h b/rc2.h
index ebff798..dd78ef3 100644
--- a/rc2.h
+++ b/rc2.h
@@ -25,7 +25,7 @@ class RC2 : public RC2_Info, public
BlockCipherDocumentation
{
public:
void UncheckedSetKey(const byte *userKey, unsigned int length,
const NameValuePairs ¶ms);
- unsigned int OptimalDataAlignment() const {return
GetAlignmentOf<word16>();}
+ unsigned int OptimalDataAlignment() const {return
GetAlignmentOf<word16, false>();}
protected:
FixedSizeSecBlock<word16, 64> K; // expanded key table
diff --git a/salsa.cpp b/salsa.cpp
index aefe74c..b6dab21 100644
--- a/salsa.cpp
+++ b/salsa.cpp
@@ -60,7 +60,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
return 16;
else
#endif
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
diff --git a/sha3.h b/sha3.h
index 232bae5..b1dc2e2 100644
--- a/sha3.h
+++ b/sha3.h
@@ -15,7 +15,7 @@ public:
SHA3(unsigned int digestSize) : m_digestSize(digestSize) {Restart();}
unsigned int DigestSize() const {return m_digestSize;}
std::string AlgorithmName() const {return "SHA-3-" +
IntToString(m_digestSize*8);}
- unsigned int OptimalDataAlignment() const {return
GetAlignmentOf<word64>();}
+ unsigned int OptimalDataAlignment() const {return
GetAlignmentOf<word64, true>();}
void Update(const byte *input, size_t length);
void Restart();
diff --git a/skipjack.h b/skipjack.h
index 6b12647..a533b79 100644
--- a/skipjack.h
+++ b/skipjack.h
@@ -22,7 +22,7 @@ class SKIPJACK : public SKIPJACK_Info, public
BlockCipherDocumentation
{
public:
void UncheckedSetKey(const byte *userKey, unsigned int length,
const NameValuePairs ¶ms);
- unsigned int OptimalDataAlignment() const {return
GetAlignmentOf<word16>();}
+ unsigned int OptimalDataAlignment() const {return
GetAlignmentOf<word16, false>();}
protected:
static const byte fTable[256];
diff --git a/sosemanuk.cpp b/sosemanuk.cpp
index 0cc798f..6f13d0b 100644
--- a/sosemanuk.cpp
+++ b/sosemanuk.cpp
@@ -293,7 +293,7 @@ unsigned int SosemanukPolicy::GetAlignment() const
return 16;
else
#endif
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int SosemanukPolicy::GetOptimalBlockSize() const
--
--
You received this message because you are subscribed to the "Crypto++ Users"
Google Group.
To unsubscribe, send an email to [email protected].
More information about Crypto++ and this group is available at
http://www.cryptopp.com.
---
You received this message because you are subscribed to the Google Groups
"Crypto++ Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.
diff --git a/GNUmakefile b/GNUmakefile
index ca36725..045203b 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,6 +1,6 @@
CXXFLAGS ?= -DNDEBUG
SYMBOLS ?= -g2
-OPTIMIZE ?= -O2
+OPTIMIZE ?= -O3
# -fPIC is supported, and enabled by default for x86_64.
# the following options reduce code size, but breaks link or makes link very slow on some systems
# CXXFLAGS += -ffunction-sections -fdata-sections
@@ -44,6 +44,11 @@ endif
# Merge symbols and optimizations
CXXFLAGS += $(SYMBOLS) $(OPTIMIZE)
+# GCC 4.8 and above vectorizes some operations, which creates alignment problems
+# ifeq ($(filter $(CXXFLAGS),-O3),-O3)
+# CXXFLAGS += -DCRYPTOPP_NO_UNALIGNED_DATA_ACCESS
+# endif
+
ifeq ($(IS_X86),1)
GCC42_OR_LATER = $(shell $(CXX) -v 2>&1 | $(EGREP) -c "^gcc version (4.[2-9]|[5-9])")
diff --git a/config.h b/config.h
index 7472ca0..21f6d4f 100644
--- a/config.h
+++ b/config.h
@@ -342,7 +342,7 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_X86 0
#endif
-#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || defined(__powerpc__)
+#if !defined(CRYPTOPP_NO_UNALIGNED_DATA_ACCESS) && (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || defined(__powerpc__))
#define CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#endif
diff --git a/crc.cpp b/crc.cpp
index 10c25c2..3b73c53 100644
--- a/crc.cpp
+++ b/crc.cpp
@@ -126,7 +126,7 @@ void CRC32::Update(const byte *s, size_t n)
{
word32 crc = m_crc;
- for(; !IsAligned<word32>(s) && n > 0; n--)
+ for(; !IsAligned<word32, false>(s) && n > 0; n--)
crc = m_tab[CRC32_INDEX(crc) ^ *s++] ^ CRC32_SHIFTED(crc);
while (n >= 4)
diff --git a/cryptlib.cpp b/cryptlib.cpp
index a9ed290..6683129 100644
--- a/cryptlib.cpp
+++ b/cryptlib.cpp
@@ -183,17 +183,17 @@ size_t BlockTransformation::AdvancedProcessBlocks(const byte *inBlocks, const by
unsigned int BlockTransformation::OptimalDataAlignment() const
{
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int StreamTransformation::OptimalDataAlignment() const
{
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int HashTransformation::OptimalDataAlignment() const
{
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
void StreamTransformation::ProcessLastBlock(byte *outString, const byte *inString, size_t length)
diff --git a/iterhash.cpp b/iterhash.cpp
index 1e31e9f..e9a812f 100644
--- a/iterhash.cpp
+++ b/iterhash.cpp
@@ -50,7 +50,7 @@ template <class T, class BASE> void IteratedHashBase<T, BASE>::Update(const byte
HashBlock(dataBuf);
return;
}
- else if (IsAligned<T>(input))
+ else if (IsAligned<T, false>(input))
{
size_t leftOver = HashMultipleBlocks((T *)input, len);
input += (len - leftOver);
@@ -138,7 +138,7 @@ template <class T, class BASE> void IteratedHashBase<T, BASE>::TruncatedFinal(by
HashBlock(dataBuf);
- if (IsAligned<HashWordType>(digest) && size%sizeof(HashWordType)==0)
+ if (IsAligned<HashWordType, false>(digest) && size%sizeof(HashWordType)==0)
ConditionalByteReverse<HashWordType>(order, (HashWordType *)digest, stateBuf, size);
else
{
diff --git a/iterhash.h b/iterhash.h
index cce9e82..e83afec 100644
--- a/iterhash.h
+++ b/iterhash.h
@@ -25,7 +25,7 @@ public:
IteratedHashBase() : m_countLo(0), m_countHi(0) {}
unsigned int OptimalBlockSize() const {return this->BlockSize();}
- unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T>();}
+ unsigned int OptimalDataAlignment() const {return GetAlignmentOf<T, false>();}
void Update(const byte *input, size_t length);
byte * CreateUpdateSpace(size_t &size);
void Restart();
diff --git a/misc.cpp b/misc.cpp
index 3c2c2a5..908578b 100644
--- a/misc.cpp
+++ b/misc.cpp
@@ -14,14 +14,22 @@
NAMESPACE_BEGIN(CryptoPP)
+// GCC 4.7, 4.9 and 5.1 generates code using vmovdqa instructions under -O3, which cause a segfault due to 128-bit word alignment requirements.
+// Refer to http://stackoverflow.com/q/31373765 and http://gcc.gnu.org/bugzilla/show_bug.cgi?id=66852 for more details.
+// #pragma GCC push_options
+// #pragma GCC optimize ("-O2")
+
void xorbuf(byte *buf, const byte *mask, size_t count)
{
size_t i;
- if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+ if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
{
- if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) && IsAligned<word64>(mask))
+ // GCC 4.8 and above vectorizes the XOR at -O3, so the data must be aligned.
+ if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) && IsAligned<word64, true>(mask))
{
+ assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
for (i=0; i<count/8; i++)
((word64*)buf)[i] ^= ((word64*)mask)[i];
count -= 8*i;
@@ -44,14 +52,18 @@ void xorbuf(byte *buf, const byte *mask, size_t count)
buf[i] ^= mask[i];
}
+// #pragma GCC pop_options
+
void xorbuf(byte *output, const byte *input, const byte *mask, size_t count)
{
size_t i;
- if (IsAligned<word32>(output) && IsAligned<word32>(input) && IsAligned<word32>(mask))
+ if (IsAligned<word32, false>(output) && IsAligned<word32, false>(input) && IsAligned<word32, false>(mask))
{
- if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(output) && IsAligned<word64>(input) && IsAligned<word64>(mask))
+ // GCC 4.8 and above vectorizes the XOR at -O3, so the data must be aligned.
+ if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(output) && IsAligned<word64, true>(input) && IsAligned<word64, true>(mask))
{
+ assert(IsAlignedOn(output, 8)); assert(IsAlignedOn(input, 8)); assert(IsAlignedOn(input, 8));
for (i=0; i<count/8; i++)
((word64*)output)[i] = ((word64*)input)[i] ^ ((word64*)mask)[i];
count -= 8*i;
@@ -81,11 +93,14 @@ bool VerifyBufsEqual(const byte *buf, const byte *mask, size_t count)
size_t i;
byte acc8 = 0;
- if (IsAligned<word32>(buf) && IsAligned<word32>(mask))
+ if (IsAligned<word32, false>(buf) && IsAligned<word32, false>(mask))
{
word32 acc32 = 0;
- if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64>(buf) && IsAligned<word64>(mask))
+ // GCC 4.8 and above could vectorize the XOR at -O3, so the data must be aligned.
+ if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned<word64, true>(buf) && IsAligned<word64, true>(mask))
{
+ assert(IsAlignedOn(buf, 8)); assert(IsAlignedOn(mask, 8));
+
word64 acc64 = 0;
for (i=0; i<count/8; i++)
acc64 |= ((word64*)buf)[i] ^ ((word64*)mask)[i];
diff --git a/misc.h b/misc.h
index 9b25ee5..9293490 100644
--- a/misc.h
+++ b/misc.h
@@ -46,7 +46,7 @@
#define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) || defined(__clang__))
// Used to manage function-level optimizations when working around compiler issues.
-// At -O3, GCC vectorizes and uses SSE instructions, even if alignment does not meet instruction requirements.
+// At -O3, GCC vectorizes uses SSE instructions, even if alignment does not meet instruction requirements.
#define GCC_OPTIMIZE_AWARE ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) || defined(__clang__))
#if GCC_DIAGNOSTIC_AWARE
@@ -387,11 +387,14 @@ inline T1 RoundUpToMultipleOf(const T1 &n, const T2 &m)
return RoundDownToMultipleOf(n+m-1, m);
}
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or miss,
+// but we know it happens with GCC 4.8 and above with -O3 in misc.cpp, xorbuf().
+// We can't provide STRICT=false by default because its a C++11 extension.
+template <class T, bool STRICT>
inline unsigned int GetAlignmentOf(T *dummy=NULL) // VC60 workaround
{
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
- if (sizeof(T) < 16)
+ if (!STRICT && sizeof(T) < 16)
return 1;
#endif
@@ -411,10 +414,13 @@ inline bool IsAlignedOn(const void *p, unsigned int alignment)
return alignment==1 || (IsPowerOf2(alignment) ? ModPowerOf2((size_t)p, alignment) == 0 : (size_t)p % alignment == 0);
}
-template <class T>
+// Set STRICT=true when the compiler vectorizes the data. This is hit or miss,
+// but we know it happens with GCC 4.8 and above with -O3 in misc.cpp, xorbuf().
+// We can't provide STRICT=false by default because its a C++11 extension.
+template <class T, bool STRICT>
inline bool IsAligned(const void *p, T *dummy=NULL) // VC60 workaround
{
- return IsAlignedOn(p, GetAlignmentOf<T>());
+ return IsAlignedOn(p, GetAlignmentOf<T, STRICT>());
}
#ifdef IS_LITTLE_ENDIAN
@@ -562,11 +568,11 @@ template<> inline void SecureWipeBuffer(word64 *buf, size_t n)
template <class T>
inline void SecureWipeArray(T *buf, size_t n)
{
- if (sizeof(T) % 8 == 0 && GetAlignmentOf<T>() % GetAlignmentOf<word64>() == 0)
+ if (sizeof(T) % 8 == 0 && GetAlignmentOf<T, false>() % GetAlignmentOf<word64, false>() == 0)
SecureWipeBuffer((word64 *)buf, n * (sizeof(T)/8));
- else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T>() % GetAlignmentOf<word32>() == 0)
+ else if (sizeof(T) % 4 == 0 && GetAlignmentOf<T, false>() % GetAlignmentOf<word32, false>() == 0)
SecureWipeBuffer((word32 *)buf, n * (sizeof(T)/4));
- else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T>() % GetAlignmentOf<word16>() == 0)
+ else if (sizeof(T) % 2 == 0 && GetAlignmentOf<T, false>() % GetAlignmentOf<word16, false>() == 0)
SecureWipeBuffer((word16 *)buf, n * (sizeof(T)/2));
else
SecureWipeBuffer((byte *)buf, n * sizeof(T));
diff --git a/panama.cpp b/panama.cpp
index 40c619f..14cc4bf 100644
--- a/panama.cpp
+++ b/panama.cpp
@@ -451,7 +451,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt
assert(length==32);
this->Reset();
this->Iterate(1, m_key);
- if (iv && IsAligned<word32>(iv))
+ if (iv && IsAligned<word32, false>(iv))
this->Iterate(1, (const word32 *)iv);
else
{
diff --git a/rc2.h b/rc2.h
index ebff798..dd78ef3 100644
--- a/rc2.h
+++ b/rc2.h
@@ -25,7 +25,7 @@ class RC2 : public RC2_Info, public BlockCipherDocumentation
{
public:
void UncheckedSetKey(const byte *userKey, unsigned int length, const NameValuePairs ¶ms);
- unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16>();}
+ unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16, false>();}
protected:
FixedSizeSecBlock<word16, 64> K; // expanded key table
diff --git a/salsa.cpp b/salsa.cpp
index aefe74c..b6dab21 100644
--- a/salsa.cpp
+++ b/salsa.cpp
@@ -60,7 +60,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
return 16;
else
#endif
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
diff --git a/sha3.h b/sha3.h
index 232bae5..b1dc2e2 100644
--- a/sha3.h
+++ b/sha3.h
@@ -15,7 +15,7 @@ public:
SHA3(unsigned int digestSize) : m_digestSize(digestSize) {Restart();}
unsigned int DigestSize() const {return m_digestSize;}
std::string AlgorithmName() const {return "SHA-3-" + IntToString(m_digestSize*8);}
- unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word64>();}
+ unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word64, true>();}
void Update(const byte *input, size_t length);
void Restart();
diff --git a/skipjack.h b/skipjack.h
index 6b12647..a533b79 100644
--- a/skipjack.h
+++ b/skipjack.h
@@ -22,7 +22,7 @@ class SKIPJACK : public SKIPJACK_Info, public BlockCipherDocumentation
{
public:
void UncheckedSetKey(const byte *userKey, unsigned int length, const NameValuePairs ¶ms);
- unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16>();}
+ unsigned int OptimalDataAlignment() const {return GetAlignmentOf<word16, false>();}
protected:
static const byte fTable[256];
diff --git a/sosemanuk.cpp b/sosemanuk.cpp
index 0cc798f..6f13d0b 100644
--- a/sosemanuk.cpp
+++ b/sosemanuk.cpp
@@ -293,7 +293,7 @@ unsigned int SosemanukPolicy::GetAlignment() const
return 16;
else
#endif
- return GetAlignmentOf<word32>();
+ return GetAlignmentOf<word32, false>();
}
unsigned int SosemanukPolicy::GetOptimalBlockSize() const