Forwarding to the list. Duh.
-------- Original Message --------
Subject: Re: Fwd: Re: STDCXX-1071 numpunct facet defect
Date: Sun, 30 Sep 2012 19:02:27 -0400
From: Liviu Nicoara <nikko...@hates.ms>
To: Martin Sebor <mse...@gmail.com>

On 9/30/12 6:18 PM, Martin Sebor wrote:
I see you did a 64-bit build while I did a 32-bit one, so I tried 64 bits. The cached version (i.e., the one compiled with -UNO_USE_NUMPUNCT_CACHE) is still about twice as fast as the non-cached one (compiled with -DNO_USE_NUMPUNCT_CACHE). I had made one change to the test program that I thought might account for the difference: I removed the call to abort from the thread function since it was causing the process to exit prematurely in some of my tests. But since you used the modified program for your latest measurements, that couldn't be it. I can't explain the differences. They just don't make sense to me. Your results should be the other way around. Can you post the disassembly of function f() for each of the two configurations of the test?
Here they are. Liviu
Dump of assembler code for function f: 0x0000000000403870 <+0>: push %r15 0x0000000000403872 <+2>: push %r14 0x0000000000403874 <+4>: push %r13 0x0000000000403876 <+6>: push %r12 0x0000000000403878 <+8>: push %rbp 0x0000000000403879 <+9>: push %rbx 0x000000000040387a <+10>: mov %rdi,%rbx 0x000000000040387d <+13>: sub $0x38,%rsp 0x0000000000403881 <+17>: nopl 0x0(%rax) 0x0000000000403888 <+24>: movzbl 0x261f11(%rip),%eax # 0x6657a0 <_ZL5pwait> 0x000000000040388f <+31>: test %al,%al 0x0000000000403891 <+33>: jne 0x403888 <f+24> 0x0000000000403893 <+35>: cmpq $0x0,0x261ef5(%rip) # 0x665790 <_ZL6nloops> 0x000000000040389b <+43>: jle 0x403b12 <f+674> 0x00000000004038a1 <+49>: xor %ebp,%ebp 0x00000000004038a3 <+51>: xor %r12d,%r12d 0x00000000004038a6 <+54>: lea 0x10(%rsp),%r13 0x00000000004038ab <+59>: lea 0x48(%rbx),%r14 0x00000000004038af <+63>: lea 0x20(%rsp),%r15 0x00000000004038b4 <+68>: jmpq 0x4039a7 <f+311> 0x00000000004038b9 <+73>: nopl 0x0(%rax) 0x00000000004038c0 <+80>: cmp $0x66a020,%rdx 0x00000000004038c7 <+87>: mov %rdi,0x20(%rsp) 0x00000000004038cc <+92>: je 0x403ab8 <f+584> 0x00000000004038d2 <+98>: mov %rdx,%rdi 0x00000000004038d5 <+101>: mov %rdx,(%rsp) 0x00000000004038d9 <+105>: callq 0x403658 <pthread_mutex_lock@plt> 0x00000000004038de <+110>: test %eax,%eax 0x00000000004038e0 <+112>: mov (%rsp),%rdx 0x00000000004038e4 <+116>: je 0x4038fb <f+139> 0x00000000004038e6 <+118>: mov $0x4452a4,%esi 0x00000000004038eb <+123>: mov $0xa,%edi 0x00000000004038f0 <+128>: xor %eax,%eax 0x00000000004038f2 <+130>: callq 0x404370 <_ZN4__rw10__rw_throwEiz> 0x00000000004038f7 <+135>: mov (%rsp),%rdx 0x00000000004038fb <+139>: addl $0x1,0x28(%rdx) 0x00000000004038ff <+143>: test %rdx,%rdx 0x0000000000403902 <+146>: je 0x40390c <f+156> 0x0000000000403904 <+148>: mov %rdx,%rdi 0x0000000000403907 <+151>: callq 0x4036c8 <pthread_mutex_unlock@plt> 0x000000000040390c <+156>: mov 0x20(%rsp),%rdx 0x0000000000403911 <+161>: mov %rdx,%rdi 0x0000000000403914 <+164>: mov %rdx,(%rsp) 
0x0000000000403918 <+168>: callq 0x403258 <strlen@plt> 0x000000000040391d <+173>: mov (%rsp),%rdx 0x0000000000403921 <+177>: add %rax,%r12 0x0000000000403924 <+180>: lea -0x40(%rdx),%rcx 0x0000000000403928 <+184>: cmp $0x66a020,%rcx 0x000000000040392f <+191>: je 0x40398b <f+283> 0x0000000000403931 <+193>: mov %rcx,%rdi 0x0000000000403934 <+196>: mov %rcx,0x8(%rsp) 0x0000000000403939 <+201>: callq 0x403658 <pthread_mutex_lock@plt> 0x000000000040393e <+206>: test %eax,%eax 0x0000000000403940 <+208>: mov (%rsp),%rdx 0x0000000000403944 <+212>: mov 0x8(%rsp),%rcx 0x0000000000403949 <+217>: je 0x403965 <f+245> 0x000000000040394b <+219>: mov $0x4452a4,%esi 0x0000000000403950 <+224>: mov $0xa,%edi 0x0000000000403955 <+229>: xor %eax,%eax 0x0000000000403957 <+231>: callq 0x404370 <_ZN4__rw10__rw_throwEiz> 0x000000000040395c <+236>: mov 0x8(%rsp),%rcx 0x0000000000403961 <+241>: mov (%rsp),%rdx 0x0000000000403965 <+245>: mov -0x18(%rdx),%esi 0x0000000000403968 <+248>: test %rcx,%rcx 0x000000000040396b <+251>: lea -0x1(%rsi),%eax 0x000000000040396e <+254>: mov %eax,-0x18(%rdx) 0x0000000000403971 <+257>: je 0x403983 <f+275> 0x0000000000403973 <+259>: mov %rcx,%rdi 0x0000000000403976 <+262>: mov %esi,0x8(%rsp) 0x000000000040397a <+266>: callq 0x4036c8 <pthread_mutex_unlock@plt> 0x000000000040397f <+271>: mov 0x8(%rsp),%esi 0x0000000000403983 <+275>: test %esi,%esi 0x0000000000403985 <+277>: jle 0x403a80 <f+528> 0x000000000040398b <+283>: add $0x1,%ebp 0x000000000040398e <+286>: movq $0x0,0x20(%rsp) 0x0000000000403997 <+295>: movslq %ebp,%rax 0x000000000040399a <+298>: cmp 0x261def(%rip),%rax # 0x665790 <_ZL6nloops> 0x00000000004039a1 <+305>: jge 0x403b00 <f+656> 0x00000000004039a7 <+311>: mov 0x40(%rbx),%eax 0x00000000004039aa <+314>: test $0x1,%al 0x00000000004039ac <+316>: jne 0x403a38 <f+456> 0x00000000004039b2 <+322>: or $0x1,%eax 0x00000000004039b5 <+325>: mov %rbx,%rsi 0x00000000004039b8 <+328>: mov %r13,%rdi 0x00000000004039bb <+331>: mov %eax,0x40(%rbx) 
0x00000000004039be <+334>: mov (%rbx),%rax 0x00000000004039c1 <+337>: callq *(%rax) 0x00000000004039c3 <+339>: mov %r13,%rsi 0x00000000004039c6 <+342>: mov %r14,%rdi 0x00000000004039c9 <+345>: callq 0x410830 <_ZNSsaSERKSs> 0x00000000004039ce <+350>: mov 0x10(%rsp),%rdx 0x00000000004039d3 <+355>: sub $0x40,%rdx 0x00000000004039d7 <+359>: cmp $0x66a020,%rdx 0x00000000004039de <+366>: je 0x403a2f <f+447> 0x00000000004039e0 <+368>: mov %rdx,%rdi 0x00000000004039e3 <+371>: mov %rdx,(%rsp) 0x00000000004039e7 <+375>: callq 0x403658 <pthread_mutex_lock@plt> 0x00000000004039ec <+380>: test %eax,%eax 0x00000000004039ee <+382>: mov (%rsp),%rdx 0x00000000004039f2 <+386>: je 0x403a09 <f+409> 0x00000000004039f4 <+388>: mov $0x4452a4,%esi 0x00000000004039f9 <+393>: mov $0xa,%edi 0x00000000004039fe <+398>: xor %eax,%eax 0x0000000000403a00 <+400>: callq 0x404370 <_ZN4__rw10__rw_throwEiz> 0x0000000000403a05 <+405>: mov (%rsp),%rdx 0x0000000000403a09 <+409>: mov 0x28(%rdx),%ecx 0x0000000000403a0c <+412>: test %rdx,%rdx 0x0000000000403a0f <+415>: lea -0x1(%rcx),%eax 0x0000000000403a12 <+418>: mov %eax,0x28(%rdx) 0x0000000000403a15 <+421>: je 0x403a27 <f+439> 0x0000000000403a17 <+423>: mov %rdx,%rdi 0x0000000000403a1a <+426>: mov %ecx,0x8(%rsp) 0x0000000000403a1e <+430>: callq 0x4036c8 <pthread_mutex_unlock@plt> 0x0000000000403a23 <+435>: mov 0x8(%rsp),%ecx 0x0000000000403a27 <+439>: test %ecx,%ecx 0x0000000000403a29 <+441>: jle 0x403ac8 <f+600> 0x0000000000403a2f <+447>: movq $0x0,0x10(%rsp) 0x0000000000403a38 <+456>: mov 0x48(%rbx),%rdi 0x0000000000403a3c <+460>: cmpl $0xffffffffffffffff,-0x18(%rdi) 0x0000000000403a40 <+464>: lea -0x40(%rdi),%rdx 0x0000000000403a44 <+468>: jne 0x4038c0 <f+80> 0x0000000000403a4a <+474>: mov -0x8(%rdi),%rcx 0x0000000000403a4e <+478>: mov %r15,%rdi 0x0000000000403a51 <+481>: mov %rcx,%rdx 0x0000000000403a54 <+484>: mov %rcx,%rsi 0x0000000000403a57 <+487>: mov %rcx,0x8(%rsp) 0x0000000000403a5c <+492>: callq 0x41ae60 <_ZNSs10_C_get_repEmm> 
0x0000000000403a61 <+497>: mov 0x8(%rsp),%rcx 0x0000000000403a66 <+502>: mov 0x48(%rbx),%rsi 0x0000000000403a6a <+506>: lea 0x40(%rax),%rdi 0x0000000000403a6e <+510>: mov %rdi,0x20(%rsp) 0x0000000000403a73 <+515>: mov %rcx,%rdx 0x0000000000403a76 <+518>: callq 0x403458 <memcpy@plt> 0x0000000000403a7b <+523>: jmpq 0x40390c <f+156> 0x0000000000403a80 <+528>: mov 0x20(%rsp),%rax 0x0000000000403a85 <+533>: mov -0x10(%rax),%rsi 0x0000000000403a89 <+537>: lea -0x40(%rax),%rdi 0x0000000000403a8d <+541>: add $0x42,%rsi 0x0000000000403a91 <+545>: mov %rsi,0x8(%rsp) 0x0000000000403a96 <+550>: callq 0x403698 <pthread_mutex_destroy@plt> 0x0000000000403a9b <+555>: mov 0x20(%rsp),%rdi 0x0000000000403aa0 <+560>: mov 0x8(%rsp),%rsi 0x0000000000403aa5 <+565>: xor %edx,%edx 0x0000000000403aa7 <+567>: sub $0x40,%rdi 0x0000000000403aab <+571>: callq 0x408170 <_ZN4__rw15__rw_deallocateEPvmi> 0x0000000000403ab0 <+576>: jmpq 0x40398b <f+283> 0x0000000000403ab5 <+581>: nopl (%rax) 0x0000000000403ab8 <+584>: callq 0x403258 <strlen@plt> 0x0000000000403abd <+589>: add %rax,%r12 0x0000000000403ac0 <+592>: jmpq 0x40398b <f+283> 0x0000000000403ac5 <+597>: nopl (%rax) 0x0000000000403ac8 <+600>: mov 0x10(%rsp),%rax 0x0000000000403acd <+605>: mov -0x10(%rax),%rsi 0x0000000000403ad1 <+609>: lea -0x40(%rax),%rdi 0x0000000000403ad5 <+613>: add $0x42,%rsi 0x0000000000403ad9 <+617>: mov %rsi,0x8(%rsp) 0x0000000000403ade <+622>: callq 0x403698 <pthread_mutex_destroy@plt> 0x0000000000403ae3 <+627>: mov 0x10(%rsp),%rdi 0x0000000000403ae8 <+632>: mov 0x8(%rsp),%rsi 0x0000000000403aed <+637>: xor %edx,%edx 0x0000000000403aef <+639>: sub $0x40,%rdi 0x0000000000403af3 <+643>: callq 0x408170 <_ZN4__rw15__rw_deallocateEPvmi> 0x0000000000403af8 <+648>: jmpq 0x403a2f <f+447> 0x0000000000403afd <+653>: nopl (%rax) 0x0000000000403b00 <+656>: mov %r12,%rax 0x0000000000403b03 <+659>: add $0x38,%rsp 0x0000000000403b07 <+663>: pop %rbx 0x0000000000403b08 <+664>: pop %rbp 0x0000000000403b09 <+665>: pop %r12 
0x0000000000403b0b <+667>: pop %r13 0x0000000000403b0d <+669>: pop %r14 0x0000000000403b0f <+671>: pop %r15 0x0000000000403b11 <+673>: retq 0x0000000000403b12 <+674>: xor %eax,%eax 0x0000000000403b14 <+676>: jmp 0x403b03 <f+659> 0x0000000000403b16 <+678>: mov %rax,%rbx 0x0000000000403b19 <+681>: mov %r13,%rdi 0x0000000000403b1c <+684>: callq 0x40ff40 <_ZNSsD2Ev> 0x0000000000403b21 <+689>: mov %rbx,%rdi 0x0000000000403b24 <+692>: callq 0x4036b8 <_Unwind_Resume@plt>
Dump of assembler code for function f: 0x0000000000403870 <+0>: push %r15 0x0000000000403872 <+2>: push %r14 0x0000000000403874 <+4>: push %r13 0x0000000000403876 <+6>: push %r12 0x0000000000403878 <+8>: mov %rdi,%r12 0x000000000040387b <+11>: push %rbp 0x000000000040387c <+12>: push %rbx 0x000000000040387d <+13>: sub $0x28,%rsp 0x0000000000403881 <+17>: nopl 0x0(%rax) 0x0000000000403888 <+24>: movzbl 0x2579d1(%rip),%eax # 0x65b260 <_ZL5pwait> 0x000000000040388f <+31>: test %al,%al 0x0000000000403891 <+33>: jne 0x403888 <f+24> 0x0000000000403893 <+35>: xor %eax,%eax 0x0000000000403895 <+37>: cmpq $0x0,0x2579b3(%rip) # 0x65b250 <_ZL6nloops> 0x000000000040389d <+45>: jle 0x403973 <f+259> 0x00000000004038a3 <+51>: xor %ebx,%ebx 0x00000000004038a5 <+53>: xor %ebp,%ebp 0x00000000004038a7 <+55>: lea 0x10(%rsp),%r13 0x00000000004038ac <+60>: jmp 0x4038cc <f+92> 0x00000000004038ae <+62>: xchg %ax,%ax 0x00000000004038b0 <+64>: add $0x1,%ebx 0x00000000004038b3 <+67>: movq $0x0,0x10(%rsp) 0x00000000004038bc <+76>: movslq %ebx,%rax 0x00000000004038bf <+79>: cmp 0x25798a(%rip),%rax # 0x65b250 <_ZL6nloops> 0x00000000004038c6 <+86>: jge 0x403970 <f+256> 0x00000000004038cc <+92>: mov (%r12),%rax 0x00000000004038d0 <+96>: mov %r12,%rsi 0x00000000004038d3 <+99>: mov %r13,%rdi 0x00000000004038d6 <+102>: callq *(%rax) 0x00000000004038d8 <+104>: mov 0x10(%rsp),%r14 0x00000000004038dd <+109>: lea -0x40(%r14),%r15 0x00000000004038e1 <+113>: mov %r14,%rdi 0x00000000004038e4 <+116>: callq 0x403258 <strlen@plt> 0x00000000004038e9 <+121>: add %rax,%rbp 0x00000000004038ec <+124>: cmp $0x65fae0,%r15 0x00000000004038f3 <+131>: je 0x4038b0 <f+64> 0x00000000004038f5 <+133>: mov %r15,%rdi 0x00000000004038f8 <+136>: callq 0x403658 <pthread_mutex_lock@plt> 0x00000000004038fd <+141>: test %eax,%eax 0x00000000004038ff <+143>: je 0x403912 <f+162> 0x0000000000403901 <+145>: mov $0x43fdc4,%esi 0x0000000000403906 <+150>: mov $0xa,%edi 0x000000000040390b <+155>: xor %eax,%eax 0x000000000040390d <+157>: 
callq 0x404090 <_ZN4__rw10__rw_throwEiz> 0x0000000000403912 <+162>: mov -0x18(%r14),%edx 0x0000000000403916 <+166>: test %r15,%r15 0x0000000000403919 <+169>: lea -0x1(%rdx),%eax 0x000000000040391c <+172>: mov %eax,-0x18(%r14) 0x0000000000403920 <+176>: je 0x403932 <f+194> 0x0000000000403922 <+178>: mov %r15,%rdi 0x0000000000403925 <+181>: mov %edx,0x8(%rsp) 0x0000000000403929 <+185>: callq 0x4036c8 <pthread_mutex_unlock@plt> 0x000000000040392e <+190>: mov 0x8(%rsp),%edx 0x0000000000403932 <+194>: test %edx,%edx 0x0000000000403934 <+196>: jg 0x4038b0 <f+64> 0x000000000040393a <+202>: mov 0x10(%rsp),%rax 0x000000000040393f <+207>: lea -0x40(%rax),%rdi 0x0000000000403943 <+211>: mov -0x10(%rax),%r14 0x0000000000403947 <+215>: callq 0x403698 <pthread_mutex_destroy@plt> 0x000000000040394c <+220>: mov 0x10(%rsp),%rdi 0x0000000000403951 <+225>: add $0x42,%r14 0x0000000000403955 <+229>: xor %edx,%edx 0x0000000000403957 <+231>: mov %r14,%rsi 0x000000000040395a <+234>: sub $0x40,%rdi 0x000000000040395e <+238>: callq 0x407e90 <_ZN4__rw15__rw_deallocateEPvmi> 0x0000000000403963 <+243>: jmpq 0x4038b0 <f+64> 0x0000000000403968 <+248>: nopl 0x0(%rax,%rax,1) 0x0000000000403970 <+256>: mov %rbp,%rax 0x0000000000403973 <+259>: add $0x28,%rsp 0x0000000000403977 <+263>: pop %rbx 0x0000000000403978 <+264>: pop %rbp 0x0000000000403979 <+265>: pop %r12 0x000000000040397b <+267>: pop %r13 0x000000000040397d <+269>: pop %r14 0x000000000040397f <+271>: pop %r15 0x0000000000403981 <+273>: retq End of assembler dump.