On Tue, Apr 05, 2016 at 09:07:57 -0700, Richard Henderson wrote:
> On 04/05/2016 08:48 AM, Paolo Bonzini wrote:
> >I think it's fine to use the struct. The exact size of the struct
> >varies from 3 to 5 32-bit words, so it's hard to write nice
> >size-dependent code for the hash.
>
> I don't think it is. We have 3 integers. It is trivial to create a simple
> function of 2 multiplies, two adds, and a remainder.
>
> Take the primes from the xxhash.h, for example:
>
>   (phys_pc * PRIME32_2 + pc * PRIME32_3 + flags)
>   % PRIME32_1
>   & (CODE_GEN_PHYS_HASH_SIZE - 1)
>
> Obviously, some bucket measurements should be taken, but I can well imagine
> that this might perform just as well as the fully generic hasher.

That function doesn't perform well: 25.06s, vs. 21.18s with xxh32.

Having the packed struct and passing it to an *inlined* xxhash is virtually unbeatable; gcc (>= v4.6; I haven't checked older versions) optimizes the inlined function since it knows the size of the struct.

To show this I'm appending the generated code for tb_hash_func when xxh32 is inlined vs. when it is not, for x86_64-softmmu. Results are similar for arm-softmmu.

Anyway, for the arm bootup test we're talking about ~0.50% of runtime spent in tb_hash_func (with xxh32 inlined), so whatever we did here could not improve overall performance much.

Thanks,

Emilio

* no inline:
00000000001a4e60 <qemu_xxh32>: 1a4e60: 48 83 ec 18 sub $0x18,%rsp 1a4e64: 4c 8d 0c b7 lea (%rdi,%rsi,4),%r9 1a4e68: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax 1a4e6f: 00 00 1a4e71: 48 89 44 24 08 mov %rax,0x8(%rsp) 1a4e76: 31 c0 xor %eax,%eax 1a4e78: 48 83 fe 03 cmp $0x3,%rsi 1a4e7c: 8d 82 b1 67 56 16 lea 0x165667b1(%rdx),%eax 1a4e82: 0f 86 92 00 00 00 jbe 1a4f1a <qemu_xxh32+0xba> 1a4e88: 4d 8d 59 f0 lea -0x10(%r9),%r11 1a4e8c: 44 8d 82 28 44 23 24 lea 0x24234428(%rdx),%r8d 1a4e93: 8d 8a 77 ca eb 85 lea -0x7a143589(%rdx),%ecx 1a4e99: 8d 82 4f 86 c8 61 lea 0x61c8864f(%rdx),%eax 1a4e9f: 90 nop 1a4ea0: 44 8b 17 mov (%rdi),%r10d 1a4ea3: 45 69 d2 77 ca eb 85 imul $0x85ebca77,%r10d,%r10d 1a4eaa: 45 01 d0 add %r10d,%r8d 1a4ead: 44 8b 57 04 mov 0x4(%rdi),%r10d 1a4eb1: 41 c1 c0 0d rol $0xd,%r8d 1a4eb5: 45 69 c0 b1 79 37 9e imul $0x9e3779b1,%r8d,%r8d 1a4ebc: 45 69 d2 77 ca eb 85 imul $0x85ebca77,%r10d,%r10d 1a4ec3: 44 01 d1 add %r10d,%ecx 1a4ec6: 44 8b 57 08 mov 0x8(%rdi),%r10d 1a4eca: c1 c1 0d rol $0xd,%ecx 1a4ecd: 69 c9 b1 79 37 9e imul $0x9e3779b1,%ecx,%ecx 1a4ed3: 45 69 d2 77 ca eb 85 imul $0x85ebca77,%r10d,%r10d 1a4eda: 44 01 d2 add %r10d,%edx 1a4edd: 44 8b 57 0c mov 0xc(%rdi),%r10d 1a4ee1: 48 83 c7 10 add $0x10,%rdi 1a4ee5: c1 c2 0d rol $0xd,%edx 1a4ee8: 69 d2 b1 79 37 9e imul $0x9e3779b1,%edx,%edx 1a4eee: 45 69 d2 77 ca eb 85 imul 
$0x85ebca77,%r10d,%r10d 1a4ef5: 44 01 d0 add %r10d,%eax 1a4ef8: c1 c0 0d rol $0xd,%eax 1a4efb: 69 c0 b1 79 37 9e imul $0x9e3779b1,%eax,%eax 1a4f01: 49 39 fb cmp %rdi,%r11 1a4f04: 73 9a jae 1a4ea0 <qemu_xxh32+0x40> 1a4f06: c1 c9 19 ror $0x19,%ecx 1a4f09: 41 c1 c8 1f ror $0x1f,%r8d 1a4f0d: c1 ca 14 ror $0x14,%edx 1a4f10: 44 01 c1 add %r8d,%ecx 1a4f13: c1 c8 0e ror $0xe,%eax 1a4f16: 01 ca add %ecx,%edx 1a4f18: 01 d0 add %edx,%eax 1a4f1a: 4c 39 cf cmp %r9,%rdi 1a4f1d: 8d 34 b0 lea (%rax,%rsi,4),%esi 1a4f20: 73 22 jae 1a4f44 <qemu_xxh32+0xe4> 1a4f22: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 1a4f28: 8b 17 mov (%rdi),%edx 1a4f2a: 48 83 c7 04 add $0x4,%rdi 1a4f2e: 69 c2 3d ae b2 c2 imul $0xc2b2ae3d,%edx,%eax 1a4f34: 01 c6 add %eax,%esi 1a4f36: c1 c6 11 rol $0x11,%esi 1a4f39: 69 f6 2f eb d4 27 imul $0x27d4eb2f,%esi,%esi 1a4f3f: 49 39 f9 cmp %rdi,%r9 1a4f42: 77 e4 ja 1a4f28 <qemu_xxh32+0xc8> 1a4f44: 89 f0 mov %esi,%eax 1a4f46: c1 e8 0f shr $0xf,%eax 1a4f49: 31 f0 xor %esi,%eax 1a4f4b: 69 d0 77 ca eb 85 imul $0x85ebca77,%eax,%edx 1a4f51: 89 d0 mov %edx,%eax 1a4f53: c1 e8 0d shr $0xd,%eax 1a4f56: 31 d0 xor %edx,%eax 1a4f58: 69 d0 3d ae b2 c2 imul $0xc2b2ae3d,%eax,%edx 1a4f5e: 89 d0 mov %edx,%eax 1a4f60: c1 e8 10 shr $0x10,%eax 1a4f63: 31 d0 xor %edx,%eax 1a4f65: 48 8b 54 24 08 mov 0x8(%rsp),%rdx 1a4f6a: 64 48 33 14 25 28 00 xor %fs:0x28,%rdx 1a4f71: 00 00 1a4f73: 75 05 jne 1a4f7a <qemu_xxh32+0x11a> 1a4f75: 48 83 c4 18 add $0x18,%rsp 1a4f79: c3 retq 1a4f7a: e8 f1 7a fe ff callq 18ca70 <__stack_chk_fail@plt> 1a4f7f: 90 nop 00000000001a4f80 <tb_hash_func>: 1a4f80: 48 83 ec 28 sub $0x28,%rsp 1a4f84: 48 89 3c 24 mov %rdi,(%rsp) 1a4f88: 48 89 74 24 08 mov %rsi,0x8(%rsp) 1a4f8d: 48 89 e7 mov %rsp,%rdi 1a4f90: 89 54 24 10 mov %edx,0x10(%rsp) 1a4f94: be 05 00 00 00 mov $0x5,%esi 1a4f99: ba 01 00 00 00 mov $0x1,%edx 1a4f9e: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax 1a4fa5: 00 00 1a4fa7: 48 89 44 24 18 mov %rax,0x18(%rsp) 1a4fac: 31 c0 xor %eax,%eax 1a4fae: e8 ad fe ff ff callq 1a4e60 
<qemu_xxh32> 1a4fb3: 48 8b 54 24 18 mov 0x18(%rsp),%rdx 1a4fb8: 64 48 33 14 25 28 00 xor %fs:0x28,%rdx 1a4fbf: 00 00 1a4fc1: 75 05 jne 1a4fc8 <tb_hash_func+0x48> 1a4fc3: 48 83 c4 28 add $0x28,%rsp 1a4fc7: c3 retq 1a4fc8: e8 a3 7a fe ff callq 18ca70 <__stack_chk_fail@plt> 1a4fcd: 0f 1f 00 nopl (%rax) * inline: 00000000001a6800 <tb_hash_func>: 1a6800: 48 83 ec 28 sub $0x28,%rsp 1a6804: 69 cf 77 ca eb 85 imul $0x85ebca77,%edi,%ecx 1a680a: 48 89 3c 24 mov %rdi,(%rsp) 1a680e: 48 c1 ef 20 shr $0x20,%rdi 1a6812: 69 ff 77 ca eb 85 imul $0x85ebca77,%edi,%edi 1a6818: 48 89 74 24 08 mov %rsi,0x8(%rsp) 1a681d: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax 1a6824: 00 00 1a6826: 48 89 44 24 18 mov %rax,0x18(%rsp) 1a682b: 31 c0 xor %eax,%eax 1a682d: 81 c1 29 44 23 24 add $0x24234429,%ecx 1a6833: 69 c6 77 ca eb 85 imul $0x85ebca77,%esi,%eax 1a6839: 48 c1 ee 20 shr $0x20,%rsi 1a683d: 81 ef 88 35 14 7a sub $0x7a143588,%edi 1a6843: 69 f6 77 ca eb 85 imul $0x85ebca77,%esi,%esi 1a6849: c1 c9 13 ror $0x13,%ecx 1a684c: c1 cf 13 ror $0x13,%edi 1a684f: 83 c0 01 add $0x1,%eax 1a6852: 69 c9 b1 79 37 9e imul $0x9e3779b1,%ecx,%ecx 1a6858: c1 c8 13 ror $0x13,%eax 1a685b: 81 c6 50 86 c8 61 add $0x61c88650,%esi 1a6861: 69 ff b1 79 37 9e imul $0x9e3779b1,%edi,%edi 1a6867: c1 ce 13 ror $0x13,%esi 1a686a: c1 c9 1f ror $0x1f,%ecx 1a686d: 69 c0 b1 79 37 9e imul $0x9e3779b1,%eax,%eax 1a6873: c1 cf 19 ror $0x19,%edi 1a6876: 69 f6 b1 79 37 9e imul $0x9e3779b1,%esi,%esi 1a687c: 8d 7c 39 14 lea 0x14(%rcx,%rdi,1),%edi 1a6880: c1 c8 14 ror $0x14,%eax 1a6883: 69 d2 3d ae b2 c2 imul $0xc2b2ae3d,%edx,%edx 1a6889: 01 f8 add %edi,%eax 1a688b: c1 ce 0e ror $0xe,%esi 1a688e: 01 c6 add %eax,%esi 1a6890: 01 f2 add %esi,%edx 1a6892: c1 ca 0f ror $0xf,%edx 1a6895: 69 d2 2f eb d4 27 imul $0x27d4eb2f,%edx,%edx 1a689b: 89 d0 mov %edx,%eax 1a689d: c1 e8 0f shr $0xf,%eax 1a68a0: 31 d0 xor %edx,%eax 1a68a2: 69 d0 77 ca eb 85 imul $0x85ebca77,%eax,%edx 1a68a8: 89 d0 mov %edx,%eax 1a68aa: c1 e8 0d shr $0xd,%eax 1a68ad: 31 d0 xor 
%edx,%eax 1a68af: 69 d0 3d ae b2 c2 imul $0xc2b2ae3d,%eax,%edx 1a68b5: 89 d0 mov %edx,%eax 1a68b7: c1 e8 10 shr $0x10,%eax 1a68ba: 31 d0 xor %edx,%eax 1a68bc: 48 8b 54 24 18 mov 0x18(%rsp),%rdx 1a68c1: 64 48 33 14 25 28 00 xor %fs:0x28,%rdx 1a68c8: 00 00 1a68ca: 75 05 jne 1a68d1 <tb_hash_func+0xd1> 1a68cc: 48 83 c4 28 add $0x28,%rsp 1a68d0: c3 retq 1a68d1: e8 9a 61 fe ff callq 18ca70 <__stack_chk_fail@plt> 1a68d6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 1a68dd: 00 00 00