Gitweb:     http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=4277eedd7908a0ca8b66fad46ee76b0ad96e6ef2
Commit:     4277eedd7908a0ca8b66fad46ee76b0ad96e6ef2
Parent:     b39a734097d5095d63eb9c709a6aaf965633bb01
Author:     Denis Vlasenko <[EMAIL PROTECTED]>
AuthorDate: Sun Jul 15 23:41:56 2007 -0700
Committer:  Linus Torvalds <[EMAIL PROTECTED]>
CommitDate: Mon Jul 16 09:05:52 2007 -0700

    vsprintf.c: optimizing, part 2: base 10 conversion speedup, v2
    
    Optimize integer-to-string conversion in vsprintf.c for base 10.  This is
    by far the most used conversion, and in some use cases it impacts
    performance.  For example, top reads /proc/$PID/stat for every process, and
    with 4000 processes decimal conversion alone takes noticeable time.
    
    Using code from
    
    http://www.cs.uiowa.edu/~jones/bcd/decimal.html
    (with permission from the author, Douglas W. Jones)
    
    binary-to-decimal-string conversion is done in groups of five digits at
    once, using only additions/subtractions/shifts (with -O2; -Os throws in
    some multiply instructions).
    
    On i386 arch gcc 4.1.2 -O2 generates ~500 bytes of code.
    
    This patch is run tested. Userspace benchmark/test is also attached.
    I tested it on PIII and AMD64 and new code is generally ~2.5 times
    faster. On AMD64:
    
    # ./vsprintf_verify-O2
    Original decimal conv: .......... 151 ns per iteration
    Patched decimal conv:  .......... 62 ns per iteration
    Testing correctness
    12895992590592 ok...        [Ctrl-C]
    # ./vsprintf_verify-O2
    Original decimal conv: .......... 151 ns per iteration
    Patched decimal conv:  .......... 62 ns per iteration
    Testing correctness
    26025406464 ok...        [Ctrl-C]
    
    More realistic test: top from busybox project was modified to
    report how many us it took to scan /proc (this does not account
    any processing done after that, like sorting process list),
    and then I test it with 4000 processes:
    
    #!/bin/sh
    i=4000
    while test $i != 0; do
        sleep 30 &
        let i--
    done
    busybox top -b -n3 >/dev/null
    
    on unpatched kernel:
    
    top: 4120 processes took 102864 microseconds to scan
    top: 4120 processes took 91757 microseconds to scan
    top: 4120 processes took 92517 microseconds to scan
    top: 4120 processes took 92581 microseconds to scan
    
    on patched kernel:
    
    top: 4120 processes took 75460 microseconds to scan
    top: 4120 processes took 66451 microseconds to scan
    top: 4120 processes took 67267 microseconds to scan
    top: 4120 processes took 67618 microseconds to scan
    
    The speedup comes from much faster generation of /proc/PID/stat
    by sprintf() calls inside the kernel.
    
    Signed-off-by: Douglas W Jones <[EMAIL PROTECTED]>
    Signed-off-by: Denys Vlasenko <[EMAIL PROTECTED]>
    Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
    Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>
---
 lib/vsprintf.c |  108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index e94b4bd..6b6734d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -135,6 +135,103 @@ static int skip_atoi(const char **s)
        return i;
 }
 
+/* Decimal conversion is by far the most typical, and is used
+ * for /proc and /sys data. This directly impacts e.g. top performance
+ * with many processes running. We optimize it for speed
+ * using code from
+ * http://www.cs.uiowa.edu/~jones/bcd/decimal.html
+ * (with permission from the author, Douglas W. Jones). */
+
+/* Formats correctly any integer in [0,99999].
+ * Outputs from one to five digits depending on input.
+ * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */
+static char* put_dec_trunc(char *buf, unsigned q)
+{
+       unsigned d3, d2, d1, d0;
+       d1 = (q>>4) & 0xf;
+       d2 = (q>>8) & 0xf;
+       d3 = (q>>12);
+
+       d0 = 6*(d3 + d2 + d1) + (q & 0xf);
+       q = (d0 * 0xcd) >> 11;
+       d0 = d0 - 10*q;
+       *buf++ = d0 + '0'; /* least significant digit */
+       d1 = q + 9*d3 + 5*d2 + d1;
+       if (d1 != 0) {
+               q = (d1 * 0xcd) >> 11;
+               d1 = d1 - 10*q;
+               *buf++ = d1 + '0'; /* next digit */
+
+               d2 = q + 2*d2;
+               if ((d2 != 0) || (d3 != 0)) {
+                       q = (d2 * 0xd) >> 7;
+                       d2 = d2 - 10*q;
+                       *buf++ = d2 + '0'; /* next digit */
+
+                       d3 = q + 4*d3;
+                       if (d3 != 0) {
+                               q = (d3 * 0xcd) >> 11;
+                               d3 = d3 - 10*q;
+                               *buf++ = d3 + '0';  /* next digit */
+                               if (q != 0)
+                                       *buf++ = q + '0';  /* most sign. digit */
+                       }
+               }
+       }
+       return buf;
+}
+/* Same with if's removed. Always emits five digits */
+static char* put_dec_full(char *buf, unsigned q)
+{
+       /* BTW, if q is in [0,9999], 8-bit ints will be enough, */
+       /* but anyway, gcc produces better code with full-sized ints */
+       unsigned d3, d2, d1, d0;
+       d1 = (q>>4) & 0xf;
+       d2 = (q>>8) & 0xf;
+       d3 = (q>>12);
+
+       /* Possible ways to approx. divide by 10 */
+       /* gcc -O2 replaces multiply with shifts and adds */
+       // (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386)
+       // (x * 0x67) >> 10:  1100111
+       // (x * 0x34) >> 9:    110100 - same
+       // (x * 0x1a) >> 8:     11010 - same
+       // (x * 0x0d) >> 7:      1101 - same, shortest code (on i386)
+
+       d0 = 6*(d3 + d2 + d1) + (q & 0xf);
+       q = (d0 * 0xcd) >> 11;
+       d0 = d0 - 10*q;
+       *buf++ = d0 + '0';
+       d1 = q + 9*d3 + 5*d2 + d1;
+               q = (d1 * 0xcd) >> 11;
+               d1 = d1 - 10*q;
+               *buf++ = d1 + '0';
+
+               d2 = q + 2*d2;
+                       q = (d2 * 0xd) >> 7;
+                       d2 = d2 - 10*q;
+                       *buf++ = d2 + '0';
+
+                       d3 = q + 4*d3;
+                               q = (d3 * 0xcd) >> 11; /* - shorter code */
+                               /* q = (d3 * 0x67) >> 10; - would also work */
+                               d3 = d3 - 10*q;
+                               *buf++ = d3 + '0';
+                                       *buf++ = q + '0';
+       return buf;
+}
+/* No inlining helps gcc to use registers better */
+static noinline char* put_dec(char *buf, unsigned long long num)
+{
+       while (1) {
+               unsigned rem;
+               if (num < 100000)
+                       return put_dec_trunc(buf, num);
+               rem = do_div(num, 100000);
+               buf = put_dec_full(buf, rem);
+       }
+}
+
 #define ZEROPAD        1               /* pad with zero */
 #define SIGN   2               /* unsigned/signed long */
 #define PLUS   4               /* show plus */
@@ -182,6 +279,11 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
        i = 0;
        if (num == 0)
                tmp[i++] = '0';
+       /* Generic code, for any base:
+       else do {
+               tmp[i++] = digits[do_div(num,base)];
+       } while (num != 0);
+       */
        else if (base != 10) { /* 8 or 16 */
                int mask = base - 1;
                int shift = 3;
@@ -190,9 +292,9 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
                        tmp[i++] = digits[((unsigned char)num) & mask];
                        num >>= shift;
                } while (num);
-       } else do { /* generic code, works for any base */
-               tmp[i++] = digits[do_div(num,10 /*base*/)];
-       } while (num);
+       } else { /* base 10 */
+               i = put_dec(tmp, num) - tmp;
+       }
 
        /* printing 100 using %2d gives "100", not "00" */
        if (i > precision)
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to