Hi,

Currently, malloc scans the bits of the chunk bitmap from position
zero, skipping a random number (n) of free slots and then picking the
next free one. This slows things down, especially if the number of
full slots diminishes. 


This diff changes the scannning to start at a random position in the
bitmap and then taking the first available free slot, wrapping if the
end of the bitmap is reached. Of course we'll still scan more if the
bitmap becomes more full, but the extra iterations skipping free slots
and then some full slots are avoided. 

The random number is derived from a global, which is incremented by a
few bits every time a chunk is needed (with a small optimization if
only one free slot is left).

The shows good speedups, especially if lots of small chunks are
allocated and on e.g. loongson which uses a larger page size than
others, while randomness of chunk allocation is preserved. 

Please test & review,

        -Otto

Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.129
diff -u -p -r1.129 malloc.c
--- malloc.c    30 Apr 2011 15:46:46 -0000      1.129
+++ malloc.c    2 May 2011 07:27:54 -0000
@@ -113,6 +113,7 @@ struct dir_info {
        struct region_info free_regions[MALLOC_MAXCACHE];
                                        /* delayed free chunk slots */
        void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];
+       u_short chunk_start;
 #ifdef MALLOC_STATS
        size_t inserts;
        size_t insert_collisions;
@@ -1013,40 +1014,31 @@ malloc_bytes(struct dir_info *d, size_t 
 
        if (bp->canary != d->canary1)
                wrterror("chunk info corrupted", NULL);
-       /* Find first word of bitmap which isn't empty */
-       for (lp = bp->bits; !*lp; lp++)
-               /* EMPTY */;
-
-       /* Find that bit, and tweak it */
-       u = 1;
-       k = 0;
-       while (!(*lp & u)) {
-               u += u;
-               k++;
-       }
-
-       /* advance a random # of positions */
-       if (bp->free > 1) {
-               i = getrnibble() % bp->free;
-               while (i > 0) {
-                       u += u;
-                       k++;
-                       if (k >= MALLOC_BITS) {
-                               while (!*++lp)
-                                       /* EMPTY */;
-                               u = 1;
-                               k = 0;
-                               if (lp - bp->bits > (bp->total - 1) /
-                                   MALLOC_BITS) {
-                                       wrterror("chunk overflow", NULL);
-                                       errno = EFAULT;
-                                       return (NULL);
-                               }
-                       }
-                       if (*lp & u)
-                               i--;
+
+       i = d->chunk_start;
+       if (bp->free > 1)
+               i += getrnibble();
+       if (i >= bp->total)
+               i &= bp->total - 1;
+       for (;;) {
+               for (;;) {
+                       lp = &bp->bits[i / MALLOC_BITS];
+                       if (!*lp) {
+                               i += MALLOC_BITS;
+                               i &= ~(MALLOC_BITS - 1); 
+                               if (i >= bp->total)
+                                       i = 0;
+                       } else
+                               break;
                }
+               k = i % MALLOC_BITS;
+               u = 1 << k;
+               if (*lp & u)
+                       break;
+               if (++i >= bp->total)
+                       i = 0;
        }
+       d->chunk_start += i + 1;
 
        *lp ^= u;

Reply via email to