On 2019-01-15 15:46, Alex Bennée wrote: > > Peter Maydell <peter.mayd...@linaro.org> writes: > >> On Mon, 14 Jan 2019 at 22:48, Alex Bennée <alex.ben...@linaro.org> wrote: >>> >>> >>> Richard Henderson <r...@twiddle.net> writes: >>>> But perhaps >>>> >>>> unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; >>>> *r = n % d; >>>> return n / d; >>>> >>>> will allow the compiler to do what the assembly does for some 64-bit >>>> hosts. >>> >>> I wonder how much cost is incurred by the jumping to the (libgcc?) div >>> helper? Anyone got an s390x about so we can benchmark the two >>> approaches? >> >> The project has an s390x system available; however it's usually >> running merge build tests so not so useful for benchmarking. >> (I can set up accounts on it but that requires me to faff about >> figuring out how to create new accounts :-)) > > I'm happy to leave this up to those who care about s390x host > performance (Thomas, Cornelia?). I'm just keen to avoid the divide > helper getting too #ifdefy.
Ok, I just did a quick'n'dirty "benchmark" on the s390x that I've got available: #include <stdio.h> #include <time.h> #include <stdint.h> uint64_t udiv_qrnnd1(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d) { unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; asm("dlgr %0, %1" : "+r"(n) : "r"(d)); *r = n >> 64; return n; } uint64_t udiv_qrnnd2(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d) { unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; *r = n % d; return n / d; } uint64_t udiv_qrnnd3(uint64_t *r, uint64_t n1, uint64_t n0, uint64_t d) { uint64_t d0, d1, q0, q1, r1, r0, m; d0 = (uint32_t)d; d1 = d >> 32; r1 = n1 % d1; q1 = n1 / d1; m = q1 * d0; r1 = (r1 << 32) | (n0 >> 32); if (r1 < m) { q1 -= 1; r1 += d; if (r1 >= d) { if (r1 < m) { q1 -= 1; r1 += d; } } } r1 -= m; r0 = r1 % d1; q0 = r1 / d1; m = q0 * d0; r0 = (r0 << 32) | (uint32_t)n0; if (r0 < m) { q0 -= 1; r0 += d; if (r0 >= d) { if (r0 < m) { q0 -= 1; r0 += d; } } } r0 -= m; *r = r0; return (q1 << 32) | q0; } int main() { uint64_t r = 0, n1 = 0, n0 = 0, d = 0; uint64_t rs = 0, rn = 0; clock_t start, end; long i; start = clock(); for (i=0; i<200000000L; i++) { n1 += 3; n0 += 987654321; d += 0x123456789; rs += udiv_qrnnd1(&r, n1, n0, d); rn += r; } end = clock(); printf("test 1: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn); r = n1 = n0 = d = rs = rn = 0; start = clock(); for (i=0; i<200000000L; i++) { n1 += 3; n0 += 987654321; d += 0x123456789; rs += udiv_qrnnd2(&r, n1, n0, d); rn += r; } end = clock(); printf("test 2: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn); r = n1 = n0 = d = rs = rn = 0; start = clock(); for (i=0; i<200000000L; i++) { n1 += 3; n0 += 987654321; d += 0x123456789; rs += udiv_qrnnd3(&r, n1, n0, d); rn += r; } end = clock(); printf("test 3: time=%li\t, rs=%li , rn = %li\n", (end-start)/1000, rs, rn); return 0; } ... and results with GCC v8.2.1 are (using -O2): test 1: time=609 , rs=2264924160200000000 , rn = 6136218997527160832 test 2: time=10127 , rs=2264924160200000000 , rn = 6136218997527160832 test 3: time=2350 , rs=2264924183048928865 , rn = 4842822048162311089 Thus the int128 version is the slowest! ... but at least it gives the same results as the DLGR instruction. The 64-bit version gives different results - do we have a bug here? Results with Clang v7.0.1 (using -O2, too) are these: test 2: time=5035 , rs=2264924160200000000 , rn = 6136218997527160832 test 3: time=1970 , rs=2264924183048928865 , rn = 4842822048162311089 Thomas