On 01/23/2013 07:05 PM, Amos Jeffries wrote:
> On 24/01/2013 7:20 a.m., Kinkie wrote:
>> the attached patch turns the unsigned int:1 flags in CachePeer to
>> bools.
> Please retain the :1 bitmasking. My microbench is showing a consistent
> ~50ms speed gain on bitmasks over full bool, particularly when there are
> multiple bools in the structure. We also get some useful object size gains.
Hello,
FYI: With g++ -O3, there is no measurable performance difference
between bool and bool:1 in my primitive tests (sources attached). I do
see that non-bool bit fields are consistently slower though ("foo:0"
below means type "foo" without bit fields; bool tests are repeated to
show result variance):
host1:
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.206s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.203s
> uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time:
> 1.191s
> uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.525s
> uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.204s
> uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.527s
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.204s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.204s
host2:
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.851s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.848s
> uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time:
> 0.863s
> uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.150s
> uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.849s
> uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.150s
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.848s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.849s
host3:
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.615s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.615s
> uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time:
> 0.696s
> uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.928s
> uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.615s
> uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.928s
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.615s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 0.615s
With g++ -O0, boolean bit fields become slower than plain bool as well:
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.347s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 2.023s
> uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time:
> 1.448s
> uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 2.002s
> uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.371s
> uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 2.034s
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 1.348s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 2.095s
The same is actually true for -O3 with an older g++ v3.4.6 on a
different host:
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 4.468s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 6.238s
> uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time:
> 4.876s
> uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 6.209s
> uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 4.470s
> uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 6.208s
> bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 4.471s
> bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time:
> 6.231s
To me, it looks like bit fields in general may hurt performance where
memory composition is not important (as expected, I guess), and that
some compilers remove any difference between full and bit boolean with
-O3 (that surprised me).
G++ assembly source comparison seems to confirm that -- boolean-based
full and bit assembly sources are virtually identical with -O3 and newer
g++ versions, while bit fields show a lot more assembly operations with
-O0 (both diffs attached). Assembly is well beyond my expertise though.
Am I testing this wrong or is it a case of YMMV? If it is "YMMV", should
we err on the side of simplicity and use simple bool where memory
savings are not important or nonexistent?
Thank you,
Alex.
--- ./bf-bool0.asm 2013-01-23 23:29:08.000000000 -0700
+++ ./bf-bool1.asm 2013-01-23 23:28:58.000000000 -0700
@@ -187,61 +187,61 @@
movl -28(%ebp), %edx
subl -72(%ebp), %eax
movl $3, _ZSt4cout+8
subl $4, %esp
subl -68(%ebp), %edx
jns .L14
addl $1000000000, %edx
subl $1, %eax
.L14:
movl %eax, -40(%ebp)
movl _ZSt4cout, %eax
fildl -40(%ebp)
fstpl -56(%ebp)
movl %edx, -40(%ebp)
fildl -40(%ebp)
fdivs .LC0
movl -12(%eax), %edx
movl $.LC2, 4(%esp)
movl $_ZSt4cout, (%esp)
movl _ZSt4cout+12(%edx), %eax
movl $10, _ZSt4cout+8(%edx)
andl $-261, %eax
orl $4, %eax
movl %eax, _ZSt4cout+12(%edx)
faddl -56(%ebp)
fstpl -56(%ebp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl $58, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
- movl $0, 4(%esp)
+ movl $1, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEi
movl $.LC3, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl $12, 4(%esp)
movl %eax, (%esp)
call _ZNSo9_M_insertImEERSoT_
movl $.LC4, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl %edi, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEi
movl $46, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
movl -60(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEi
movl $.LC5, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
fildl -64(%ebp)
fdivs .LC6
flds .LC7
fxch %st(1)
fucomi %st(1), %st
jae .L15
--- ./bf-bool0.asm 2013-01-23 23:25:19.000000000 -0700
+++ ./bf-bool1.asm 2013-01-23 23:26:53.000000000 -0700
@@ -339,156 +339,182 @@
.text
.globl main
.type main, @function
main:
.LFB986:
.cfi_startproc
leal 4(%esp), %ecx
.cfi_def_cfa 1, 0
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
.cfi_escape 0x10,0x5,0x2,0x75,0
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
.cfi_escape 0xf,0x3,0x75,0x70,0x6
.cfi_escape 0x10,0x3,0x2,0x75,0x74
.cfi_escape 0x10,0x6,0x2,0x75,0x78
.cfi_escape 0x10,0x7,0x2,0x75,0x7c
subl $104, %esp
movl %ecx, %eax
movl 4(%eax), %eax
addl $4, %eax
movl (%eax), %eax
movl %eax, (%esp)
call atoi
movl %eax, -28(%ebp)
movl $1, -64(%ebp)
- movb $0, -60(%ebp)
- movb $0, -59(%ebp)
- movb $0, -58(%ebp)
+ movzbl -60(%ebp), %eax
+ andl $-2, %eax
+ movb %al, -60(%ebp)
+ movzbl -60(%ebp), %eax
+ andl $-3, %eax
+ movb %al, -60(%ebp)
+ movzbl -60(%ebp), %eax
+ andl $-5, %eax
+ movb %al, -60(%ebp)
movl $0, -56(%ebp)
leal -52(%ebp), %eax
movl %eax, (%esp)
call _Z3nowv
subl $4, %esp
movl $0, -32(%ebp)
jmp .L17
.L22:
movzbl -60(%ebp), %eax
+ andl $1, %eax
testb %al, %al
je .L18
movl -64(%ebp), %edx
movl -56(%ebp), %eax
addl %edx, %eax
movl %eax, -64(%ebp)
jmp .L19
.L18:
- movzbl -59(%ebp), %eax
+ movzbl -60(%ebp), %eax
+ shrb %al
+ andl $1, %eax
testb %al, %al
je .L20
movl -64(%ebp), %edx
movl -56(%ebp), %eax
movl %edx, %ecx
subl %eax, %ecx
movl %ecx, %eax
movl %eax, -64(%ebp)
jmp .L19
.L20:
- movzbl -58(%ebp), %eax
+ movzbl -60(%ebp), %eax
+ shrb $2, %al
+ andl $1, %eax
testb %al, %al
je .L21
movl -56(%ebp), %edx
movl -64(%ebp), %eax
movl %edx, %ecx
subl %eax, %ecx
movl %ecx, %eax
movl %eax, -56(%ebp)
jmp .L19
.L21:
movl -64(%ebp), %edx
movl -56(%ebp), %eax
xorl %edx, %eax
movl %eax, -56(%ebp)
.L19:
movl -32(%ebp), %ecx
movl $1431655766, %edx
movl %ecx, %eax
imull %edx
movl %ecx, %eax
sarl $31, %eax
subl %eax, %edx
movl %edx, %eax
addl %eax, %eax
addl %edx, %eax
movl %ecx, %edx
subl %eax, %edx
testl %edx, %edx
setne %al
+ movl %eax, %edx
+ andl $1, %edx
+ movzbl -60(%ebp), %eax
+ andl $-2, %eax
+ orl %edx, %eax
movb %al, -60(%ebp)
movl -32(%ebp), %ecx
movl $1717986919, %edx
movl %ecx, %eax
imull %edx
sarl %edx
movl %ecx, %eax
sarl $31, %eax
subl %eax, %edx
movl %edx, %eax
sall $2, %eax
addl %edx, %eax
movl %ecx, %edx
subl %eax, %edx
testl %edx, %edx
setne %al
- movb %al, -59(%ebp)
+ andl $1, %eax
+ leal (%eax,%eax), %edx
+ movzbl -60(%ebp), %eax
+ andl $-3, %eax
+ orl %edx, %eax
+ movb %al, -60(%ebp)
movl -32(%ebp), %ecx
movl $-1840700269, %edx
movl %ecx, %eax
imull %edx
leal (%edx,%ecx), %eax
movl %eax, %edx
sarl $2, %edx
movl %ecx, %eax
sarl $31, %eax
subl %eax, %edx
movl %edx, %eax
sall $3, %eax
subl %edx, %eax
movl %ecx, %edx
subl %eax, %edx
testl %edx, %edx
setne %al
- movb %al, -58(%ebp)
+ andl $1, %eax
+ leal 0(,%eax,4), %edx
+ movzbl -60(%ebp), %eax
+ andl $-5, %eax
+ orl %edx, %eax
+ movb %al, -60(%ebp)
addl $1, -32(%ebp)
.L17:
movl -32(%ebp), %eax
cmpl -28(%ebp), %eax
setl %al
testb %al, %al
jne .L22
leal -44(%ebp), %eax
movl %eax, (%esp)
call _Z3nowv
subl $4, %esp
movl $3, 4(%esp)
movl $_ZSt4cout+4, (%esp)
call _ZNSt8ios_base9precisionEi
movl -52(%ebp), %eax
movl -48(%ebp), %edx
movl %eax, 8(%esp)
movl %edx, 12(%esp)
movl -44(%ebp), %eax
movl -40(%ebp), %edx
movl %eax, (%esp)
movl %edx, 4(%esp)
call _Zmi8timespecS_
fstpl -96(%ebp)
fildl -28(%ebp)
fldl .LC2
fdivrp %st, %st(1)
fldl .LC3
fxch %st(1)
fucomi %st(1), %st
@@ -519,61 +545,61 @@
movl %ebx, %eax
xorb $0, %ah
movl %eax, -104(%ebp)
movl %esi, %edx
xorl $-2147483648, %edx
movl %edx, -100(%ebp)
movl -104(%ebp), %ebx
movl -100(%ebp), %esi
.L24:
movl -56(%ebp), %edi
movl -64(%ebp), %ecx
movl %ecx, -80(%ebp)
leal -36(%ebp), %eax
movl $10, 4(%esp)
movl %eax, (%esp)
call _ZSt4setwi
subl $4, %esp
movl $_ZSt5fixedRSt8ios_base, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZNSolsEPFRSt8ios_baseS0_E
movl -36(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _ZStlsIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_St5_Setw
movl $.LC4, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl $58, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
- movl $0, 4(%esp)
+ movl $1, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEi
movl $.LC5, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl $12, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEj
movl $.LC6, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl -80(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEi
movl $46, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
movl %edi, 4(%esp)
movl %eax, (%esp)
call _ZNSolsEi
movl $.LC7, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl %ebx, 4(%esp)
movl %esi, 8(%esp)
movl %eax, (%esp)
call _ZNSolsEy
movl $77, 4(%esp)
movl %eax, (%esp)
# Run every benchmark binary with 100000000 iterations. The bool pair is run
# a second time at the end so that run-to-run variance is visible.
# Only the BITS=0 binaries are listed as prerequisites; each rule below also
# produces the matching BITS=1 binary in its second command.
# NOTE(review): recipe lines must start with a TAB in a real Makefile; the
# leading whitespace appears to have been lost in this copy.
test: bf-bool0 bf-int0 bf-char0
@./bf-bool0 100000000
@./bf-bool1 100000000
@./bf-char0 100000000
@./bf-char1 100000000
@./bf-int0 100000000
@./bf-int1 100000000
@./bf-bool0 100000000
@./bf-bool1 100000000
# Plain bool members (BITS=0) and bool bit fields (BITS=1).
bf-bool0: bitfields.cc Makefile
g++ -DTYPE=bool -DBITS=0 -o ./bf-bool0 -O0 -Wall bitfields.cc -lrt
g++ -DTYPE=bool -DBITS=1 -o ./bf-bool1 -O0 -Wall bitfields.cc -lrt
# NOTE(review): the bf-int* binaries are built with TYPE=uint8_t and the
# bf-char* binaries below with TYPE=uint32_t, so the binary names look swapped
# relative to the types. The program prints the real TYPE, so the reported
# labels are still correct.
bf-int0: bitfields.cc Makefile
g++ -DTYPE='uint8_t' -DBITS=0 -o ./bf-int0 -O0 -Wall bitfields.cc -lrt
g++ -DTYPE='uint8_t' -DBITS=1 -o ./bf-int1 -O0 -Wall bitfields.cc -lrt
# uint32_t members (see the naming note above).
bf-char0: bitfields.cc Makefile
g++ -DTYPE=uint32_t -DBITS=0 -o ./bf-char0 -O0 -Wall bitfields.cc -lrt
g++ -DTYPE=uint32_t -DBITS=1 -o ./bf-char1 -O0 -Wall bitfields.cc -lrt
#include <stdint.h>
#include <iostream>
#include <iomanip>
#include <time.h>
#include <stdlib.h>
// Two-step stringification. STR(x) macro-expands its argument first and then
// hands the result to STR_EXPAND, which turns it into a string literal.
// Calling STR_EXPAND directly would stringify the unexpanded macro name.
#define STR_EXPAND(x) #x
#define STR(x) STR_EXPAND(x)
// Structure whose flag members are the subject of the benchmark: three flags
// placed between two ints.
// TYPE and BITS are not defined in this file; the accompanying build commands
// supply them with -DTYPE=... and -DBITS=... on the compiler command line.
struct Test1 {
int a;
#if BITS
// BITS is non-zero: each flag is a bit field that is BITS bits wide.
TYPE b0: BITS;
TYPE b1: BITS;
TYPE b2: BITS;
#else
// BITS is zero: each flag is an ordinary member of type TYPE.
TYPE b0;
TYPE b1;
TYPE b2;
#endif
int c;
};
// Returns the current CLOCK_MONOTONIC reading.
// Calls abort() if the clock cannot be read.
timespec now()
{
    timespec stamp = { 0, 0 };
    if (clock_gettime(CLOCK_MONOTONIC, &stamp) != 0)
        abort();
    return stamp;
}
// Returns the difference t1 - t2 in seconds as a double.
// When the nanosecond difference is negative, one second is borrowed so the
// fractional part stays non-negative before the two parts are combined.
double operator -(timespec t1, timespec t2) {
    time_t secs = t1.tv_sec - t2.tv_sec;
    long nanos = t1.tv_nsec - t2.tv_nsec;
    if (nanos < 0) {
        nanos += 1000000000L;
        --secs;
    }
    return secs + nanos / 1e9;
}
// Benchmark driver.
//
// Usage: <program> <iterations>
//
// Runs a loop that reads and rewrites the three Test1 flags on every
// iteration, times it with now(), and prints one result line with the flag
// type, BITS, sizeof(Test1), the final a/c values, the iteration count in
// millions and the elapsed seconds.
//
// Returns 0 on success and 1 when the iteration count argument is missing.
int main(int argc, char *argv[])
{
    // Without this check, running with no argument passes a null pointer
    // to atoi().
    if (argc < 2) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "bitfields")
                  << " <iterations>\n";
        return 1;
    }

    const int count = atoi(argv[1]);
    Test1 test1 = { 1, false, false, false, 0 };

    const timespec start = now();
    for (int i = 0; i < count; ++i) {
        // Read side: branch on the flags set by the previous iteration.
        // NOTE(review): a and c are plain ints, so these sums and differences
        // may overflow for large counts -- confirm that is acceptable.
        if (test1.b0)
            test1.a = test1.a + test1.c;
        else if (test1.b1)
            test1.a = test1.a - test1.c;
        else if (test1.b2)
            test1.c = test1.c - test1.a;
        else
            test1.c = test1.a ^ test1.c;

        // Write side: update every flag on each pass.
        test1.b0 = (i % 3) != 0;
        test1.b1 = (i % 5) != 0;
        test1.b2 = (i % 7) != 0;
    }
    const timespec end = now();

    // Printing the final a/c values makes the loop's results observable.
    std::cout.precision(3);
    std::cout << std::fixed << std::setw(10) <<
        STR(TYPE) << ':' << BITS <<
        " size: " << sizeof(test1) <<
        " final: " << test1.a << '.' << test1.c <<
        " iterations: " << static_cast<uint64_t>(count/1e6) << 'M' <<
        " time: " << (end-start) << "s" <<
        "\n";
    return 0;
}