https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64704
--- Comment #9 from Mikhail Maltsev <maltsevm at gmail dot com> --- >what can i do to make the ptr aligned by 16-byte. Well, you may skip the first few bytes (of course, not just discard them, but process them one by one). Fortunately, you don't need to do it manually; it can be done by the compiler. The problem is that when you use a pointer to uint16, GCC assumes that it is already aligned on a 2-byte boundary (if that is not true, the behavior is undefined). Consider this program: #include <stdint.h> #include <stdlib.h> #include <stdio.h> #include <linux/icmpv6.h> typedef uint8_t uint8; typedef uint16_t uint16; typedef uint32_t uint32; uint8 buf[1024] = { 0xFF, 0x01, 0x00, 0x02, 0x00 }; class MessageBuffer { public: MessageBuffer(uint8 *data, uint16 len) : data_(data), len_(len) { } uint16 getLength() { return len_ - 1; } uint16 __attribute__((noinline)) icmp6Checksum_ub (int update); uint16 __attribute__((noinline)) icmp6Checksum_naive (int update); uint8 __attribute__((noinline)) findPayloadType (void **payloadStart) { uint8 *p; asm volatile ("leaq 1(%0), %1" : "=r"(p) : "r"(data_) : ); /* p = data_ + 1; GCC will not use this information during tree optimization */ *payloadStart = p; return ICMPV6_ECHO_REQUEST; } private: uint8 *data_; uint16 len_; }; uint16 MessageBuffer::icmp6Checksum_ub(int) { register uint32 sum = 0xffff; struct icmp6_hdr *icmp6Ptr = NULL; uint8 type = findPayloadType((void**)&icmp6Ptr); (void)type; /* inhibit warning */ register int i; uint16 len = getLength(); register uint16 *ptr = (uint16 *)icmp6Ptr; for (i = 0; i < len - 1; i += 2) { sum += *ptr++; } return (sum); } uint16 MessageBuffer::icmp6Checksum_naive(int) { register uint32 sum = 0xffff; uint8 *data; findPayloadType((void**)&data); uint16 len = getLength(); for (int i = 0; i < len - 1; i += 2) { sum += data[i] | (data[i + 1] << 8); } return (sum); } int main() { MessageBuffer buffer(buf, 1000); printf("0x%.4x\n", buffer.icmp6Checksum_naive(0)); printf("0x%.4x\n", buffer.icmp6Checksum_ub(0)); } 
icmp6Checksum_naive calculates the checksum (at least, I hope so), while icmp6Checksum_ub causes a segfault (I tried with g++ -O3 -funroll-loops -msse2, GCC 4.8.2). >i heard of that it is not necesary to aligned by 16-byte in x86 Maybe you are confusing movdqa with movdqu (or some other instruction)? (movdqa requires a 16-byte-aligned memory operand and faults otherwise; movdqu accepts unaligned addresses.) Here is a generic implementation from the Linux kernel (there are also platform-specific versions): http://lxr.free-electrons.com/source/lib/checksum.c Notice that the case where the address is odd is handled separately (especially in the platform-specific code).