On 01/23/2013 07:05 PM, Amos Jeffries wrote:
> On 24/01/2013 7:20 a.m., Kinkie wrote:

>>    the attached patch turns the unsigned int:1 flags in CachePeer to
>> bools.


> Please retain the :1 bitmasking. My microbench is showing a consistent
> ~50ms speed gain on bitmasks over full bool, particularly when there are
> multiple bools in the structure. We also get some useful object size gains.

Hello,

    FYI: With g++ -O3, there is no measurable performance difference
between bool and bool:1 in my primitive tests (sources attached). I do
see that non-bool bit fields are consistently slower though ("foo:0"
below means type "foo" without bit fields; bool tests are repeated to
show result variance):


host1:
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.206s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.203s
>   uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time: 
> 1.191s
>   uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.525s
>    uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.204s
>    uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.527s
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.204s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.204s

host2:
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.851s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.848s
>   uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time: 
> 0.863s
>   uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.150s
>    uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.849s
>    uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.150s
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.848s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.849s

host3:
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.615s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.615s
>   uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time: 
> 0.696s
>   uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.928s
>    uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.615s
>    uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.928s
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.615s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 0.615s


With g++ -O0, boolean bit fields become slower than plain bool as well:

>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.347s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 2.023s
>   uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time: 
> 1.448s
>   uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 2.002s
>    uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.371s
>    uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 2.034s
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 1.348s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 2.095s


The same is actually true for -O3 with an older g++ v3.4.6 on a
different host:

>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 4.468s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 6.238s
>   uint32_t:0 size: 20 final: -1085972333.443588956 iterations: 100M time: 
> 4.876s
>   uint32_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 6.209s
>    uint8_t:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 4.470s
>    uint8_t:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 6.208s
>       bool:0 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 4.471s
>       bool:1 size: 12 final: -1085972333.443588956 iterations: 100M time: 
> 6.231s



To me, it looks like bit fields in general may hurt performance where
memory composition is not important (as expected, I guess), and that
some compilers remove any difference between full and bit boolean with
-O3 (that surprised me).

G++ assembly source comparisons seem to confirm that -- boolean-based
full and bit assembly sources are virtually identical with -O3 and newer
g++ versions, while bit fields show a lot more assembly operations with
-O0 (both diffs attached). Assembly is well beyond my expertise though.


Am I testing this wrong or is it a case of YMMV? If it is "YMMV", should
we err on the side of simplicity and use simple bool where memory
savings are not important or nonexistent?


Thank you,

Alex.

--- ./bf-bool0.asm	2013-01-23 23:29:08.000000000 -0700
+++ ./bf-bool1.asm	2013-01-23 23:28:58.000000000 -0700
@@ -187,61 +187,61 @@
 	movl	-28(%ebp), %edx
 	subl	-72(%ebp), %eax
 	movl	$3, _ZSt4cout+8
 	subl	$4, %esp
 	subl	-68(%ebp), %edx
 	jns	.L14
 	addl	$1000000000, %edx
 	subl	$1, %eax
 .L14:
 	movl	%eax, -40(%ebp)
 	movl	_ZSt4cout, %eax
 	fildl	-40(%ebp)
 	fstpl	-56(%ebp)
 	movl	%edx, -40(%ebp)
 	fildl	-40(%ebp)
 	fdivs	.LC0
 	movl	-12(%eax), %edx
 	movl	$.LC2, 4(%esp)
 	movl	$_ZSt4cout, (%esp)
 	movl	_ZSt4cout+12(%edx), %eax
 	movl	$10, _ZSt4cout+8(%edx)
 	andl	$-261, %eax
 	orl	$4, %eax
 	movl	%eax, _ZSt4cout+12(%edx)
 	faddl	-56(%ebp)
 	fstpl	-56(%ebp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	$58, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
-	movl	$0, 4(%esp)
+	movl	$1, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEi
 	movl	$.LC3, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	$12, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSo9_M_insertImEERSoT_
 	movl	$.LC4, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	%edi, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEi
 	movl	$46, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
 	movl	-60(%ebp), %edx
 	movl	%edx, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEi
 	movl	$.LC5, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	fildl	-64(%ebp)
 	fdivs	.LC6
 	flds	.LC7
 	fxch	%st(1)
 	fucomi	%st(1), %st
 	jae	.L15
--- ./bf-bool0.asm	2013-01-23 23:25:19.000000000 -0700
+++ ./bf-bool1.asm	2013-01-23 23:26:53.000000000 -0700
@@ -339,156 +339,182 @@
 	.text
 	.globl	main
 	.type	main, @function
 main:
 .LFB986:
 	.cfi_startproc
 	leal	4(%esp), %ecx
 	.cfi_def_cfa 1, 0
 	andl	$-16, %esp
 	pushl	-4(%ecx)
 	pushl	%ebp
 	movl	%esp, %ebp
 	.cfi_escape 0x10,0x5,0x2,0x75,0
 	pushl	%edi
 	pushl	%esi
 	pushl	%ebx
 	pushl	%ecx
 	.cfi_escape 0xf,0x3,0x75,0x70,0x6
 	.cfi_escape 0x10,0x3,0x2,0x75,0x74
 	.cfi_escape 0x10,0x6,0x2,0x75,0x78
 	.cfi_escape 0x10,0x7,0x2,0x75,0x7c
 	subl	$104, %esp
 	movl	%ecx, %eax
 	movl	4(%eax), %eax
 	addl	$4, %eax
 	movl	(%eax), %eax
 	movl	%eax, (%esp)
 	call	atoi
 	movl	%eax, -28(%ebp)
 	movl	$1, -64(%ebp)
-	movb	$0, -60(%ebp)
-	movb	$0, -59(%ebp)
-	movb	$0, -58(%ebp)
+	movzbl	-60(%ebp), %eax
+	andl	$-2, %eax
+	movb	%al, -60(%ebp)
+	movzbl	-60(%ebp), %eax
+	andl	$-3, %eax
+	movb	%al, -60(%ebp)
+	movzbl	-60(%ebp), %eax
+	andl	$-5, %eax
+	movb	%al, -60(%ebp)
 	movl	$0, -56(%ebp)
 	leal	-52(%ebp), %eax
 	movl	%eax, (%esp)
 	call	_Z3nowv
 	subl	$4, %esp
 	movl	$0, -32(%ebp)
 	jmp	.L17
 .L22:
 	movzbl	-60(%ebp), %eax
+	andl	$1, %eax
 	testb	%al, %al
 	je	.L18
 	movl	-64(%ebp), %edx
 	movl	-56(%ebp), %eax
 	addl	%edx, %eax
 	movl	%eax, -64(%ebp)
 	jmp	.L19
 .L18:
-	movzbl	-59(%ebp), %eax
+	movzbl	-60(%ebp), %eax
+	shrb	%al
+	andl	$1, %eax
 	testb	%al, %al
 	je	.L20
 	movl	-64(%ebp), %edx
 	movl	-56(%ebp), %eax
 	movl	%edx, %ecx
 	subl	%eax, %ecx
 	movl	%ecx, %eax
 	movl	%eax, -64(%ebp)
 	jmp	.L19
 .L20:
-	movzbl	-58(%ebp), %eax
+	movzbl	-60(%ebp), %eax
+	shrb	$2, %al
+	andl	$1, %eax
 	testb	%al, %al
 	je	.L21
 	movl	-56(%ebp), %edx
 	movl	-64(%ebp), %eax
 	movl	%edx, %ecx
 	subl	%eax, %ecx
 	movl	%ecx, %eax
 	movl	%eax, -56(%ebp)
 	jmp	.L19
 .L21:
 	movl	-64(%ebp), %edx
 	movl	-56(%ebp), %eax
 	xorl	%edx, %eax
 	movl	%eax, -56(%ebp)
 .L19:
 	movl	-32(%ebp), %ecx
 	movl	$1431655766, %edx
 	movl	%ecx, %eax
 	imull	%edx
 	movl	%ecx, %eax
 	sarl	$31, %eax
 	subl	%eax, %edx
 	movl	%edx, %eax
 	addl	%eax, %eax
 	addl	%edx, %eax
 	movl	%ecx, %edx
 	subl	%eax, %edx
 	testl	%edx, %edx
 	setne	%al
+	movl	%eax, %edx
+	andl	$1, %edx
+	movzbl	-60(%ebp), %eax
+	andl	$-2, %eax
+	orl	%edx, %eax
 	movb	%al, -60(%ebp)
 	movl	-32(%ebp), %ecx
 	movl	$1717986919, %edx
 	movl	%ecx, %eax
 	imull	%edx
 	sarl	%edx
 	movl	%ecx, %eax
 	sarl	$31, %eax
 	subl	%eax, %edx
 	movl	%edx, %eax
 	sall	$2, %eax
 	addl	%edx, %eax
 	movl	%ecx, %edx
 	subl	%eax, %edx
 	testl	%edx, %edx
 	setne	%al
-	movb	%al, -59(%ebp)
+	andl	$1, %eax
+	leal	(%eax,%eax), %edx
+	movzbl	-60(%ebp), %eax
+	andl	$-3, %eax
+	orl	%edx, %eax
+	movb	%al, -60(%ebp)
 	movl	-32(%ebp), %ecx
 	movl	$-1840700269, %edx
 	movl	%ecx, %eax
 	imull	%edx
 	leal	(%edx,%ecx), %eax
 	movl	%eax, %edx
 	sarl	$2, %edx
 	movl	%ecx, %eax
 	sarl	$31, %eax
 	subl	%eax, %edx
 	movl	%edx, %eax
 	sall	$3, %eax
 	subl	%edx, %eax
 	movl	%ecx, %edx
 	subl	%eax, %edx
 	testl	%edx, %edx
 	setne	%al
-	movb	%al, -58(%ebp)
+	andl	$1, %eax
+	leal	0(,%eax,4), %edx
+	movzbl	-60(%ebp), %eax
+	andl	$-5, %eax
+	orl	%edx, %eax
+	movb	%al, -60(%ebp)
 	addl	$1, -32(%ebp)
 .L17:
 	movl	-32(%ebp), %eax
 	cmpl	-28(%ebp), %eax
 	setl	%al
 	testb	%al, %al
 	jne	.L22
 	leal	-44(%ebp), %eax
 	movl	%eax, (%esp)
 	call	_Z3nowv
 	subl	$4, %esp
 	movl	$3, 4(%esp)
 	movl	$_ZSt4cout+4, (%esp)
 	call	_ZNSt8ios_base9precisionEi
 	movl	-52(%ebp), %eax
 	movl	-48(%ebp), %edx
 	movl	%eax, 8(%esp)
 	movl	%edx, 12(%esp)
 	movl	-44(%ebp), %eax
 	movl	-40(%ebp), %edx
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	call	_Zmi8timespecS_
 	fstpl	-96(%ebp)
 	fildl	-28(%ebp)
 	fldl	.LC2
 	fdivrp	%st, %st(1)
 	fldl	.LC3
 	fxch	%st(1)
 	fucomi	%st(1), %st
@@ -519,61 +545,61 @@
 	movl	%ebx, %eax
 	xorb	$0, %ah
 	movl	%eax, -104(%ebp)
 	movl	%esi, %edx
 	xorl	$-2147483648, %edx
 	movl	%edx, -100(%ebp)
 	movl	-104(%ebp), %ebx
 	movl	-100(%ebp), %esi
 .L24:
 	movl	-56(%ebp), %edi
 	movl	-64(%ebp), %ecx
 	movl	%ecx, -80(%ebp)
 	leal	-36(%ebp), %eax
 	movl	$10, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZSt4setwi
 	subl	$4, %esp
 	movl	$_ZSt5fixedRSt8ios_base, 4(%esp)
 	movl	$_ZSt4cout, (%esp)
 	call	_ZNSolsEPFRSt8ios_baseS0_E
 	movl	-36(%ebp), %edx
 	movl	%edx, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_St5_Setw
 	movl	$.LC4, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	$58, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
-	movl	$0, 4(%esp)
+	movl	$1, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEi
 	movl	$.LC5, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	$12, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEj
 	movl	$.LC6, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	-80(%ebp), %edx
 	movl	%edx, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEi
 	movl	$46, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
 	movl	%edi, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEi
 	movl	$.LC7, 4(%esp)
 	movl	%eax, (%esp)
 	call	_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	movl	%ebx, 4(%esp)
 	movl	%esi, 8(%esp)
 	movl	%eax, (%esp)
 	call	_ZNSolsEy
 	movl	$77, 4(%esp)
 	movl	%eax, (%esp)
# Runs each benchmark binary with 100000000 iterations. The bool pair is run
# a second time at the end of the list.
# Only the *0 targets are listed as prerequisites; each of the rules below
# builds the matching *1 binary in the same recipe.
test: bf-bool0 bf-int0 bf-char0
        @./bf-bool0 100000000
        @./bf-bool1 100000000
        @./bf-char0 100000000
        @./bf-char1 100000000
        @./bf-int0 100000000
        @./bf-int1 100000000
        @./bf-bool0 100000000
        @./bf-bool1 100000000

# TYPE=bool, built once with BITS=0 and once with BITS=1.
bf-bool0: bitfields.cc Makefile
        g++ -DTYPE=bool -DBITS=0 -o ./bf-bool0 -O0 -Wall bitfields.cc -lrt
        g++ -DTYPE=bool -DBITS=1 -o ./bf-bool1 -O0 -Wall bitfields.cc -lrt

# NOTE: despite the "int" in the binary names, these are compiled with
# TYPE=uint8_t.
bf-int0: bitfields.cc Makefile
        g++ -DTYPE='uint8_t' -DBITS=0 -o ./bf-int0 -O0 -Wall bitfields.cc -lrt
        g++ -DTYPE='uint8_t' -DBITS=1 -o ./bf-int1 -O0 -Wall bitfields.cc -lrt

# NOTE: despite the "char" in the binary names, these are compiled with
# TYPE=uint32_t.
bf-char0: bitfields.cc Makefile
        g++ -DTYPE=uint32_t -DBITS=0 -o ./bf-char0 -O0 -Wall bitfields.cc -lrt
        g++ -DTYPE=uint32_t -DBITS=1 -o ./bf-char1 -O0 -Wall bitfields.cc -lrt

#include <stdint.h>
#include <iostream>
#include <iomanip>
#include <time.h>
#include <stdlib.h>

#define STR_EXPAND(tok) #tok
#define STR(tok) STR_EXPAND(tok)

// Benchmark knobs. They are normally supplied on the compiler command line
// (-DTYPE=... -DBITS=...); the fallbacks below only apply when a knob is
// missing, so the file still builds without any -D flags.
#ifndef TYPE
#define TYPE bool
#endif
#ifndef BITS
#define BITS 0
#endif

/// Structure under test: three flag members of type TYPE placed between two
/// ints. When BITS is non-zero the flags are declared as bit fields of width
/// BITS; when BITS is zero they are plain TYPE members.
struct Test1 {
	int a;
#if BITS
	// bit-field variant
	TYPE b0: BITS;
	TYPE b1: BITS;
	TYPE b2: BITS;
#else
	// full-width variant
	TYPE b0;
	TYPE b1;
	TYPE b2;
#endif
	int c;
};

/// Returns the current CLOCK_MONOTONIC reading.
/// Aborts the process if clock_gettime() reports failure.
timespec now()
{
	timespec stamp = { 0, 0 };
	const bool failed = clock_gettime(CLOCK_MONOTONIC, &stamp) != 0;
	if (failed)
		abort();
	return stamp;
}

/// Returns t1 - t2 expressed in seconds as a double.
double operator -(timespec t1, timespec t2) {
	time_t seconds = t1.tv_sec - t2.tv_sec;
	long nanos = t1.tv_nsec - t2.tv_nsec;

	// A negative nanosecond difference borrows one whole second.
	if (nanos < 0) {
		nanos += 1000000000L;
		seconds -= 1;
	}

	return seconds + nanos / 1e9;
}

/// Benchmark driver.
///
/// Usage: <program> <iterations>
///
/// Runs a loop of <iterations> steps that reads and rewrites the three flag
/// members of a Test1 object, then prints one result line: the TYPE and BITS
/// the binary was compiled with, sizeof(Test1), the final values of a and c,
/// the iteration count in millions and the elapsed time in seconds.
///
/// Returns 0 on success, 1 when the iteration count is missing or negative.
int main(int argc, char *argv[])
{
	// Without this check a missing argument would hand a null argv[1] to atoi().
	if (argc < 2) {
		std::cerr << "usage: " << (argc > 0 ? argv[0] : "bitfields") <<
			" <iterations>\n";
		return 1;
	}

	const int count = atoi(argv[1]);
	// A negative count would later be converted to uint64_t via a negative
	// double, which is undefined; reject it up front.
	if (count < 0) {
		std::cerr << "iteration count must not be negative: " << argv[1] << "\n";
		return 1;
	}

	Test1 test1 = { 1, false, false, false, 0 };

	const timespec start = now();
	for (int i = 0; i < count; ++i) {
		// Read side: the first flag that is set selects how a and c are combined.
		if (test1.b0)
			test1.a = test1.a + test1.c;
		else if (test1.b1)
			test1.a = test1.a - test1.c;
		else if (test1.b2)
			test1.c = test1.c - test1.a;
		else
			test1.c = test1.a ^ test1.c;

		// Write side: each flag is rewritten on every iteration.
		test1.b0 = (i % 3) != 0;
		test1.b1 = (i % 5) != 0;
		test1.b2 = (i % 7) != 0;
	}
	const timespec end = now();

	std::cout.precision(3);
	std::cout << std::fixed << std::setw(10) <<
		STR(TYPE) << ':' << BITS <<
		" size: " << sizeof(test1) <<
		" final: " << test1.a << '.' << test1.c <<
		" iterations: " << static_cast<uint64_t>(count/1e6) << 'M' <<
		" time: " << (end-start) << "s" <<
		"\n";

	return 0;
}

Reply via email to