gcc-4.3.3 (and 4.3.2; probably other versions, too) seems to produce bad code when accessing an array of small 'volatile' objects -- it may try to access multiple such objects in a 'parallel/vectorized' fashion.
-ftree-vectorize (which is enabled by -O3, the optimization level used in the examples below) seems to be the option that triggers this behavior. E.g., instead of reading four consecutive 'volatile char's sequentially, it reads a single 32-bit longword (on x86_64 it even uses XMM registers to read 16 volatile chars at once). This may crash, e.g., when accessing a memory-mapped device which allows only 8-bit accesses. The vectorized access is preceded by a single, redundant access of the correct width, implementing something like: for (i=0; i<16/vector_size; i++) { d[i] = s[i]; ((vector char*)d)[i] = ((vector char*)s)[i]; } Look for the statements after '.L6', which are executed if source and destination are 4-byte (16-byte in case of XMM) aligned and if the src/dst vectors don't overlap. Both facts (redundant access + vectorized access) seem to violate C99 (5.1.2.3, paragraph 2): "At certain specified points in the execution sequence called 'sequence points', all side effects of previous evaluations shall be complete and no side effects of subsequent evaluations shall have taken place." This rule seems to be violated at the ';' sequence point which terminates the assignment.
void volarr_cpy(char *d, volatile char *s) { int i; for ( i=0; i<16; i++ ) d[i]=s[i]; } compiled for i386 with gcc-4.3.3 -m32 -O3 -S -c .file "volcharr_cpy.c" .text .p2align 4,,15 .globl volarr_cpy .type volarr_cpy, @function volarr_cpy: pushl %ebp movl %esp, %ebp pushl %esi pushl %ebx movl 12(%ebp), %ecx movl 8(%ebp), %edx movl %ecx, %eax orl %edx, %eax testb $3, %al leal 4(%ecx), %ebx leal 4(%edx), %esi je .L8 .L2: movzbl (%ecx), %eax movb %al, (%edx) movzbl 1(%ecx), %eax movb %al, 1(%edx) movzbl 2(%ecx), %eax movb %al, 2(%edx) movzbl 3(%ecx), %eax movb %al, 3(%edx) movzbl 4(%ecx), %eax movb %al, 4(%edx) movzbl 5(%ecx), %eax movb %al, 5(%edx) movzbl 6(%ecx), %eax movb %al, 6(%edx) movzbl 7(%ecx), %eax movb %al, 7(%edx) movzbl 8(%ecx), %eax movb %al, 8(%edx) movzbl 9(%ecx), %eax movb %al, 9(%edx) movzbl 10(%ecx), %eax movb %al, 10(%edx) movzbl 11(%ecx), %eax movb %al, 11(%edx) movzbl 12(%ecx), %eax movb %al, 12(%edx) movzbl 13(%ecx), %eax movb %al, 13(%edx) movzbl 14(%ecx), %eax movb %al, 14(%edx) movzbl 15(%ecx), %eax movb %al, 15(%edx) popl %ebx popl %esi popl %ebp ret .p2align 4,,7 .p2align 3 .L8: cmpl %ebx, %edx jbe .L9 .L6: movzbl (%ecx), %eax movl (%ecx), %eax movl %eax, (%edx) movzbl 1(%ecx), %eax movl 4(%ecx), %eax movl %eax, 4(%edx) movzbl 2(%ecx), %eax leal 4(%ebx), %edx movl 4(%ebx), %eax movl %eax, 4(%esi) movzbl 3(%ecx), %eax movl 4(%edx), %eax movl %eax, 8(%esi) popl %ebx popl %esi popl %ebp ret .p2align 4,,7 .p2align 3 .L9: cmpl %esi, %ecx jbe .L2 jmp .L6 .size volarr_cpy, .-volarr_cpy .ident "GCC: (Ubuntu 4.3.3-5ubuntu4) 4.3.3" .section .note.GNU-stack,"",@progbits compiled for x86_64 with gcc -S -c -O3 .file "volcharr_cpy.c" .text .p2align 4,,15 .globl volarr_cpy .type volarr_cpy, @function volarr_cpy: .LFB2: testb $15, %dil je .L8 .L2: movzbl (%rsi), %eax movb %al, (%rdi) movzbl 1(%rsi), %eax movb %al, 1(%rdi) movzbl 2(%rsi), %eax movb %al, 2(%rdi) movzbl 3(%rsi), %eax movb %al, 3(%rdi) movzbl 4(%rsi), %eax movb %al, 4(%rdi) movzbl 5(%rsi), %eax 
movb %al, 5(%rdi) movzbl 6(%rsi), %eax movb %al, 6(%rdi) movzbl 7(%rsi), %eax movb %al, 7(%rdi) movzbl 8(%rsi), %eax movb %al, 8(%rdi) movzbl 9(%rsi), %eax movb %al, 9(%rdi) movzbl 10(%rsi), %eax movb %al, 10(%rdi) movzbl 11(%rsi), %eax movb %al, 11(%rdi) movzbl 12(%rsi), %eax movb %al, 12(%rdi) movzbl 13(%rsi), %eax movb %al, 13(%rdi) movzbl 14(%rsi), %eax movb %al, 14(%rdi) movzbl 15(%rsi), %eax movb %al, 15(%rdi) ret .p2align 4,,10 .p2align 3 .L8: leaq 16(%rsi), %rax cmpq %rax, %rdi jbe .L9 .L6: movzbl (%rsi), %eax movdqu (%rsi), %xmm0 movdqa %xmm0, (%rdi) ret .p2align 4,,10 .p2align 3 .L9: leaq 16(%rdi), %rax cmpq %rax, %rsi jbe .L2 .p2align 4,,2 .p2align 3 jmp .L6 .LFE2: .size volarr_cpy, .-volarr_cpy .section .eh_frame,"a",@progbits .Lframe1: .long .LECIE1-.LSCIE1 .LSCIE1: .long 0x0 .byte 0x1 .string "zR" .uleb128 0x1 .sleb128 -8 .byte 0x10 .uleb128 0x1 .byte 0x3 .byte 0xc .uleb128 0x7 .uleb128 0x8 .byte 0x90 .uleb128 0x1 .align 8 .LECIE1: .LSFDE1: .long .LEFDE1-.LASFDE1 .LASFDE1: .long .LASFDE1-.Lframe1 .long .LFB2 .long .LFE2-.LFB2 .uleb128 0x0 .align 8 .LEFDE1: .ident "GCC: (Ubuntu 4.3.3-5ubuntu4) 4.3.3" .section .note.GNU-stack,"",@progbits compiled for powerpc with powerpc-rtems-gcc -S -c -O3 .file "volcharr_cpy.c" .gnu_attribute 4, 1 .gnu_attribute 8, 1 .section ".text" .align 2 .globl volarr_cpy .type volarr_cpy, @function volarr_cpy: or 0,4,3 addi 11,4,4 andi. 
9,0,3 addi 10,3,4 beq- 0,.L8 .L2: lbz 0,0(4) stb 0,0(3) lbz 9,1(4) stb 9,1(3) lbz 0,2(4) stb 0,2(3) lbz 9,3(4) stb 9,3(3) lbz 0,4(4) stb 0,4(3) lbz 9,5(4) stb 9,5(3) lbz 0,6(4) stb 0,6(3) lbz 9,7(4) stb 9,7(3) lbz 0,8(4) stb 0,8(3) lbz 9,9(4) stb 9,9(3) lbz 0,10(4) stb 0,10(3) lbz 9,11(4) stb 9,11(3) lbz 0,12(4) stb 0,12(3) lbz 9,13(4) stb 9,13(3) lbz 0,14(4) stb 0,14(3) lbz 9,15(4) stb 9,15(3) blr .L8: cmplw 7,3,11 ble- 7,.L9 .L6: lbz 0,0(4) addi 9,11,4 lwz 0,0(4) stw 0,0(3) lbz 0,1(4) lwz 0,4(4) stw 0,4(3) lbz 0,2(4) lwz 0,4(11) stw 0,4(10) lbz 0,3(4) lwz 0,4(9) stw 0,8(10) blr .L9: cmplw 7,4,10 ble- 7,.L2 b .L6 .size volarr_cpy, .-volarr_cpy .ident "GCC: (GNU) 4.3.2" -- Summary: gcc-4.3.3 vectorizes access to volatile array Product: gcc Version: 4.3.3 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: strauman at slac dot stanford dot edu GCC build triplet: x86_64-unkown-linux GCC host triplet: x86_64-unknown-linux GCC target triplet: x86_64-unknown-linux, i386-unknown-linux, i386-unknown- rtems, po http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40542