Re: request for testing: malloc and large allocations

2022-02-25 Thread Otto Moerbeek
On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote:

> On Fri, Jan 28, 2022 at 05:17:48PM +0100, Otto Moerbeek wrote:
> 
> > On Fri, Jan 28, 2022 at 04:33:28PM +0100, Alexander Bluhm wrote:
> > 
> > > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:
> > > > currently malloc does cache a number of free'ed regions up to 128k in
> > > > size. This cache is indexed by size (in # of pages), so it is very
> > > > quick to check.
> > > >
> > > > Some programs allocate and deallocate larger allocations in a frantic
> > > > way.  Accommodate those programs by also keeping a cache of regions
> > > > between 128k and 2M, in a cache of variable sized regions.
> > > >
> > > > My test case speeds up about twice. A make build gets a small speedup.
> > > >
> > > > This has been tested by myself on amd64 quite intensively. I am asking
> > > > for more tests, especially on more "exotic" platforms. I will do arm64
> > > > myself soon.  Testing can be running your favorite programs, doing make
> > > > builds or running tests in regress/lib/libc/malloc.
> > > 
> > > I see openssl and tmux crash with this diff.
> > > /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64,
> > > i386.
> > 
> > Are you running with any malloc flags?
> 
> This bug report enabled me to find a bug that would pop up if G mode
> is enabled.
> 
> New diff below. New tests appreciated.

This has been in snaps for a while.

Anybody willing to review and OK?

-Otto


> Index: stdlib/malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.272
> diff -u -p -r1.272 malloc.c
> --- stdlib/malloc.c   19 Sep 2021 09:15:22 -  1.272
> +++ stdlib/malloc.c   31 Jan 2022 16:27:31 -
> @@ -113,13 +113,27 @@ struct region_info {
>  
>  LIST_HEAD(chunk_head, chunk_info);
>  
> -#define MAX_CACHEABLE_SIZE   32
> -struct cache {
> - void *pages[MALLOC_MAXCACHE];
> +/*
> + * Two caches, one for "small" regions, one for "big".
> + * Small cache is an array per size, big cache is one array with different
> + * sized regions
> + */
> +#define MAX_SMALLCACHEABLE_SIZE  32
> +#define MAX_BIGCACHEABLE_SIZE512
> +/* If the total # of pages is larger than this, evict before inserting */
> +#define BIGCACHE_FILL(sz)(MAX_BIGCACHEABLE_SIZE * (sz) / 4)
> +
> +struct smallcache {
> + void **pages;
>   ushort length;
>   ushort max;
>  };
>  
> +struct bigcache {
> + void *page;
> + size_t psize;
> +};
> +
>  struct dir_info {
>   u_int32_t canary1;
>   int active; /* status of malloc */
> @@ -139,7 +153,10 @@ struct dir_info {
>   void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1];
>   u_char rbytes[32];  /* random bytes */
>   /* free pages cache */
> - struct cache cache[MAX_CACHEABLE_SIZE];
> + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE];
> + size_t bigcache_used;
> + size_t bigcache_size;
> + struct bigcache *bigcache;
>  #ifdef MALLOC_STATS
>   size_t inserts;
>   size_t insert_collisions;
> @@ -207,7 +224,7 @@ struct malloc_readonly {
>  #ifdef MALLOC_STATS
>   int malloc_stats;   /* dump statistics at end */
>  #endif
> - u_int32_t malloc_canary;/* Matched against ones in malloc_pool 
> */
> + u_int32_t malloc_canary;/* Matched against ones in pool */
>  };
>  
>  /* This object is mapped PROT_READ after initialisation to prevent tampering 
> */
> @@ -714,18 +731,61 @@ unmap(struct dir_info *d, void *p, size_
>   size_t psz = sz >> MALLOC_PAGESHIFT;
>   void *r;
>   u_short i;
> - struct cache *cache;
> + struct smallcache *cache;
>  
>   if (sz != PAGEROUND(sz) || psz == 0)
>   wrterror(d, "munmap round");
>  
> - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) {
> + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
> + psz <= MAX_BIGCACHEABLE_SIZE) {
> + u_short base = getrbyte(d);
> + u_short j;
> +
> + /* don't look through all slots */
> + for (j = 0; j < d->bigcache_size / 4; j++) {
> + i = (base + j) % d->bigcache_size;
> + if (d->bigcache_used <
> + BIGCACHE_FILL(d->bigcache_size))  {
> + if (d->bigcache[i].psize == 0)
> + break;
> + } else {
> + if (d->bigcache[i].psize != 0)
> + break;
> + }
> + }
> + /* if we didn't find a preferred slot, use random one */
> + if (d->bigcache[i].psize != 0) {
> + size_t tmp;
> +
> + r = d->bigcache[i].page;
> + d->bigcache_used -= d->bigcache[i].psize;
> 

Re: request for testing: malloc and large allocations

2022-02-05 Thread Otto Moerbeek
On Sat, Feb 05, 2022 at 08:07:42PM +0100, Jan Stary wrote:

> On Feb 05 17:35:46, o...@drijf.net wrote:
> > On Sat, Feb 05, 2022 at 05:22:50PM +0100, Jan Stary wrote:
> > 
> > > On Feb 02 00:04:37, alexander.bl...@gmx.net wrote:
> > > > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote:
> > > > > > Are you running with any malloc flags?
> > > > > 
> > > > > This bug report enabled me to find a bug that would pop up if G mode
> > > > > is enabled.
> > > > > 
> > > > > New diff below. New tests appreciated.
> > > 
> > > 
> > > Passed a make build on macppc (Mac Mini, dmesg below).
> > > 
> > > 4288m32.43s real  3309m41.24s user   802m02.37s system
> > > Recompiling now with the new malloc to see the difference.
> > 
> > Note that during a make build, libs get installed after building them
> > so all dynamically linked programs will use the new malloc from that
> > point, as there is no revision bump.
> 
> Does that mean that both passes of compiling the compiler
> (i.e., compiling clang with the current clang,
> and then compiling clang with the new clang,
> if I remember correctly) already use the new libc?
> 
> (Compiling clang has taken most of the ~3 days of make build.)

Clang does a single pass afaik; it does not do a full bootstrap.  That
means clang will be compiled with the currently installed (old) clang,
but that clang will use the newly built and installed libc.

-Otto



Re: request for testing: malloc and large allocations

2022-02-05 Thread Jan Stary
On Feb 05 17:35:46, o...@drijf.net wrote:
> On Sat, Feb 05, 2022 at 05:22:50PM +0100, Jan Stary wrote:
> 
> > On Feb 02 00:04:37, alexander.bl...@gmx.net wrote:
> > > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote:
> > > > > Are you running with any malloc flags?
> > > > 
> > > > This bug report enabled me to find a bug that would pop up if G mode
> > > > is enabled.
> > > > 
> > > > New diff below. New tests appreciated.
> > 
> > 
> > Passed a make build on macppc (Mac Mini, dmesg below).
> > 
> > 4288m32.43s real  3309m41.24s user   802m02.37s system
> > Recompiling now with the new malloc to see the difference.
> 
> Note that during a make build, libs get installed after building them
> so all dynamically linked programs will use the new malloc from that
> point, as there is no revision bump.

Does that mean that both passes of compiling the compiler
(i.e., compiling clang with the current clang,
and then compiling clang with the new clang,
if I remember correctly) already use the new libc?

(Compiling clang has taken most of the ~3 days of make build.)

Jan

> > [ using 1308496 bytes of bsd ELF symbol table ]
> > console out [ATY,RockHopper2_A] console in [keyboard], using USB
> > using parent ATY,RockHopper2Paren:: memaddr 9800, size 800 : 
> > consaddr 9c008000 : ioaddr 9002, size 2: width 1920 linebytes 2048 
> > height 1080 depth 8
> > Copyright (c) 1982, 1986, 1989, 1991, 1993
> > The Regents of the University of California.  All rights reserved.
> > Copyright (c) 1995-2022 OpenBSD. All rights reserved.  
> > https://www.OpenBSD.org
> > 
> > OpenBSD 7.0-current (GENERIC) #0: Mon Jan 31 13:22:27 CET 2022
> > h...@ppc.stare.cz:/usr/src/sys/arch/macppc/compile/GENERIC
> > real mem = 1073741824 (1024MB)
> > avail mem = 1025527808 (978MB)
> > random: good seed from bootblocks
> > mpath0 at root
> > scsibus0 at mpath0: 256 targets
> > mainbus0 at root: model PowerMac10,2
> > cpu0 at mainbus0: 7447A (Revision 0x102): 1499 MHz: 512KB L2 cache
> > mem0 at mainbus0
> > spdmem0 at mem0: 1GB DDR SDRAM non-parity PC3200CL3.0
> > memc0 at mainbus0: uni-n rev 0xd2
> > "hw-clock" at memc0 not configured
> > kiic0 at memc0 offset 0xf8001000
> > iic0 at kiic0
> > mpcpcibr0 at mainbus0 pci: uni-north
> > pci0 at mpcpcibr0 bus 0
> > pchb0 at pci0 dev 11 function 0 "Apple UniNorth AGP" rev 0x00
> > agp at pchb0 not configured
> > radeondrm0 at pci0 dev 16 function 0 "ATI Radeon 9200" rev 0x01
> > drm0 at radeondrm0
> > radeondrm0: irq 48
> > mpcpcibr1 at mainbus0 pci: uni-north
> > pci1 at mpcpcibr1 bus 0
> > macobio0 at pci1 dev 23 function 0 "Apple Intrepid" rev 0x00
> > openpic0 at macobio0 offset 0x4: version 0x4614 feature 3f0302 LE
> > macgpio0 at macobio0 offset 0x50
> > "modem-reset" at macgpio0 offset 0x1d not configured
> > "modem-power" at macgpio0 offset 0x1c not configured
> > macgpio1 at macgpio0 offset 0x9: irq 47
> > "programmer-switch" at macgpio0 offset 0x11 not configured
> > "gpio5" at macgpio0 offset 0x6f not configured
> > "gpio6" at macgpio0 offset 0x70 not configured
> > "extint-gpio15" at macgpio0 offset 0x67 not configured
> > "escc-legacy" at macobio0 offset 0x12000 not configured
> > zs0 at macobio0 offset 0x13000: irq 22,23
> > zstty0 at zs0 channel 0
> > zstty1 at zs0 channel 1
> > aoa0 at macobio0 offset 0x1: irq 30,1,2
> > "timer" at macobio0 offset 0x15000 not configured
> > adb0 at macobio0 offset 0x16000
> > apm0 at adb0: battery flags 0x0, 0% charged
> > piic0 at adb0
> > iic1 at piic0
> > maxtmp0 at iic1 addr 0xc8: max6642
> > kiic1 at macobio0 offset 0x18000
> > iic2 at kiic1
> > wdc0 at macobio0 offset 0x2 irq 24: DMA
> > audio0 at aoa0
> > ohci0 at pci1 dev 26 function 0 "Apple Intrepid USB" rev 0x00: irq 29, 
> > version 1.0, legacy support
> > ohci1 at pci1 dev 27 function 0 "NEC USB" rev 0x43: irq 63, version 1.0
> > ohci2 at pci1 dev 27 function 1 "NEC USB" rev 0x43: irq 63, version 1.0
> > ehci0 at pci1 dev 27 function 2 "NEC USB" rev 0x04: irq 63
> > usb0 at ehci0: USB revision 2.0
> > uhub0 at usb0 configuration 1 interface 0 "NEC EHCI root hub" rev 2.00/1.00 
> > addr 1
> > usb1 at ohci0: USB revision 1.0
> > uhub1 at usb1 configuration 1 interface 0 "Apple OHCI root hub" rev 
> > 1.00/1.00 addr 1
> > usb2 at ohci1: USB revision 1.0
> > uhub2 at usb2 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 
> > addr 1
> > usb3 at ohci2: USB revision 1.0
> > uhub3 at usb3 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 
> > addr 1
> > mpcpcibr2 at mainbus0 pci: uni-north
> > pci2 at mpcpcibr2 bus 0
> > kauaiata0 at pci2 dev 13 function 0 "Apple Intrepid ATA" rev 0x00
> > wdc1 at kauaiata0 irq 39: DMA
> > wd0 at wdc1 channel 0 drive 0: 
> > wd0: 16-sector PIO, LBA48, 152627MB, 312581808 sectors
> > wd0(wdc1:0:0): using PIO mode 4, DMA mode 2, Ultra-DMA mode 5
> > "Apple UniNorth Firewire" rev 0x81 at pci2 dev 14 function 0 not configured
> > gem0 at pci2 dev 15 function 0 "Apple Uni-N2 GMAC" rev 

Re: request for testing: malloc and large allocations

2022-02-05 Thread Otto Moerbeek
On Sat, Feb 05, 2022 at 05:22:50PM +0100, Jan Stary wrote:

> On Feb 02 00:04:37, alexander.bl...@gmx.net wrote:
> > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote:
> > > > Are you running with any malloc flags?
> > > 
> > > This bug report enabled me to find a bug that would pop up if G mode
> > > is enabled.
> > > 
> > > New diff below. New tests appreciated.
> 
> 
> Passed a make build on macppc (Mac Mini, dmesg below).
> 
> 4288m32.43s real  3309m41.24s user   802m02.37s system
> Recompiling now with the new malloc to see the difference.

Note that during a make build, libs get installed after building them
so all dynamically linked programs will use the new malloc from that
point, as there is no revision bump.

-Otto

> 
> 
>   Jan
> 
> 
> [ using 1308496 bytes of bsd ELF symbol table ]
> console out [ATY,RockHopper2_A] console in [keyboard], using USB
> using parent ATY,RockHopper2Paren:: memaddr 9800, size 800 : consaddr 
> 9c008000 : ioaddr 9002, size 2: width 1920 linebytes 2048 height 1080 
> depth 8
> Copyright (c) 1982, 1986, 1989, 1991, 1993
>   The Regents of the University of California.  All rights reserved.
> Copyright (c) 1995-2022 OpenBSD. All rights reserved.  https://www.OpenBSD.org
> 
> OpenBSD 7.0-current (GENERIC) #0: Mon Jan 31 13:22:27 CET 2022
> h...@ppc.stare.cz:/usr/src/sys/arch/macppc/compile/GENERIC
> real mem = 1073741824 (1024MB)
> avail mem = 1025527808 (978MB)
> random: good seed from bootblocks
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root: model PowerMac10,2
> cpu0 at mainbus0: 7447A (Revision 0x102): 1499 MHz: 512KB L2 cache
> mem0 at mainbus0
> spdmem0 at mem0: 1GB DDR SDRAM non-parity PC3200CL3.0
> memc0 at mainbus0: uni-n rev 0xd2
> "hw-clock" at memc0 not configured
> kiic0 at memc0 offset 0xf8001000
> iic0 at kiic0
> mpcpcibr0 at mainbus0 pci: uni-north
> pci0 at mpcpcibr0 bus 0
> pchb0 at pci0 dev 11 function 0 "Apple UniNorth AGP" rev 0x00
> agp at pchb0 not configured
> radeondrm0 at pci0 dev 16 function 0 "ATI Radeon 9200" rev 0x01
> drm0 at radeondrm0
> radeondrm0: irq 48
> mpcpcibr1 at mainbus0 pci: uni-north
> pci1 at mpcpcibr1 bus 0
> macobio0 at pci1 dev 23 function 0 "Apple Intrepid" rev 0x00
> openpic0 at macobio0 offset 0x4: version 0x4614 feature 3f0302 LE
> macgpio0 at macobio0 offset 0x50
> "modem-reset" at macgpio0 offset 0x1d not configured
> "modem-power" at macgpio0 offset 0x1c not configured
> macgpio1 at macgpio0 offset 0x9: irq 47
> "programmer-switch" at macgpio0 offset 0x11 not configured
> "gpio5" at macgpio0 offset 0x6f not configured
> "gpio6" at macgpio0 offset 0x70 not configured
> "extint-gpio15" at macgpio0 offset 0x67 not configured
> "escc-legacy" at macobio0 offset 0x12000 not configured
> zs0 at macobio0 offset 0x13000: irq 22,23
> zstty0 at zs0 channel 0
> zstty1 at zs0 channel 1
> aoa0 at macobio0 offset 0x1: irq 30,1,2
> "timer" at macobio0 offset 0x15000 not configured
> adb0 at macobio0 offset 0x16000
> apm0 at adb0: battery flags 0x0, 0% charged
> piic0 at adb0
> iic1 at piic0
> maxtmp0 at iic1 addr 0xc8: max6642
> kiic1 at macobio0 offset 0x18000
> iic2 at kiic1
> wdc0 at macobio0 offset 0x2 irq 24: DMA
> audio0 at aoa0
> ohci0 at pci1 dev 26 function 0 "Apple Intrepid USB" rev 0x00: irq 29, 
> version 1.0, legacy support
> ohci1 at pci1 dev 27 function 0 "NEC USB" rev 0x43: irq 63, version 1.0
> ohci2 at pci1 dev 27 function 1 "NEC USB" rev 0x43: irq 63, version 1.0
> ehci0 at pci1 dev 27 function 2 "NEC USB" rev 0x04: irq 63
> usb0 at ehci0: USB revision 2.0
> uhub0 at usb0 configuration 1 interface 0 "NEC EHCI root hub" rev 2.00/1.00 
> addr 1
> usb1 at ohci0: USB revision 1.0
> uhub1 at usb1 configuration 1 interface 0 "Apple OHCI root hub" rev 1.00/1.00 
> addr 1
> usb2 at ohci1: USB revision 1.0
> uhub2 at usb2 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 
> addr 1
> usb3 at ohci2: USB revision 1.0
> uhub3 at usb3 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 
> addr 1
> mpcpcibr2 at mainbus0 pci: uni-north
> pci2 at mpcpcibr2 bus 0
> kauaiata0 at pci2 dev 13 function 0 "Apple Intrepid ATA" rev 0x00
> wdc1 at kauaiata0 irq 39: DMA
> wd0 at wdc1 channel 0 drive 0: 
> wd0: 16-sector PIO, LBA48, 152627MB, 312581808 sectors
> wd0(wdc1:0:0): using PIO mode 4, DMA mode 2, Ultra-DMA mode 5
> "Apple UniNorth Firewire" rev 0x81 at pci2 dev 14 function 0 not configured
> gem0 at pci2 dev 15 function 0 "Apple Uni-N2 GMAC" rev 0x80: irq 41, address 
> 00:14:51:17:42:34
> bmtphy0 at gem0 phy 0: BCM5221 100baseTX PHY, rev. 4
> uhub4 at uhub3 port 1 configuration 1 interface 0 "Mitsumi Electric Hub in 
> Apple Extended USB Keyboard" rev 1.10/4.10 addr 2
> uhidev0 at uhub4 port 3 configuration 1 interface 0 "Mitsumi Electric Apple 
> Extended USB Keyboard" rev 1.10/4.10 addr 3
> uhidev0: iclass 3/1
> ukbd0 at uhidev0: 8 variable keys, 6 key codes, country code 13
> wskbd0 at ukbd0: console keyboard
> uhidev1 at 

Re: request for testing: malloc and large allocations

2022-02-05 Thread Jan Stary
On Feb 02 00:04:37, alexander.bl...@gmx.net wrote:
> On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote:
> > > Are you running with any malloc flags?
> > 
> > This bug report enabled me to find a bug that would pop up if G mode
> > is enabled.
> > 
> > New diff below. New tests appreciated.


Passed a make build on macppc (Mac Mini, dmesg below).

4288m32.43s real  3309m41.24s user   802m02.37s system
Recompiling now with the new malloc to see the difference.


Jan


[ using 1308496 bytes of bsd ELF symbol table ]
console out [ATY,RockHopper2_A] console in [keyboard], using USB
using parent ATY,RockHopper2Paren:: memaddr 9800, size 800 : consaddr 
9c008000 : ioaddr 9002, size 2: width 1920 linebytes 2048 height 1080 
depth 8
Copyright (c) 1982, 1986, 1989, 1991, 1993
The Regents of the University of California.  All rights reserved.
Copyright (c) 1995-2022 OpenBSD. All rights reserved.  https://www.OpenBSD.org

OpenBSD 7.0-current (GENERIC) #0: Mon Jan 31 13:22:27 CET 2022
h...@ppc.stare.cz:/usr/src/sys/arch/macppc/compile/GENERIC
real mem = 1073741824 (1024MB)
avail mem = 1025527808 (978MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root: model PowerMac10,2
cpu0 at mainbus0: 7447A (Revision 0x102): 1499 MHz: 512KB L2 cache
mem0 at mainbus0
spdmem0 at mem0: 1GB DDR SDRAM non-parity PC3200CL3.0
memc0 at mainbus0: uni-n rev 0xd2
"hw-clock" at memc0 not configured
kiic0 at memc0 offset 0xf8001000
iic0 at kiic0
mpcpcibr0 at mainbus0 pci: uni-north
pci0 at mpcpcibr0 bus 0
pchb0 at pci0 dev 11 function 0 "Apple UniNorth AGP" rev 0x00
agp at pchb0 not configured
radeondrm0 at pci0 dev 16 function 0 "ATI Radeon 9200" rev 0x01
drm0 at radeondrm0
radeondrm0: irq 48
mpcpcibr1 at mainbus0 pci: uni-north
pci1 at mpcpcibr1 bus 0
macobio0 at pci1 dev 23 function 0 "Apple Intrepid" rev 0x00
openpic0 at macobio0 offset 0x4: version 0x4614 feature 3f0302 LE
macgpio0 at macobio0 offset 0x50
"modem-reset" at macgpio0 offset 0x1d not configured
"modem-power" at macgpio0 offset 0x1c not configured
macgpio1 at macgpio0 offset 0x9: irq 47
"programmer-switch" at macgpio0 offset 0x11 not configured
"gpio5" at macgpio0 offset 0x6f not configured
"gpio6" at macgpio0 offset 0x70 not configured
"extint-gpio15" at macgpio0 offset 0x67 not configured
"escc-legacy" at macobio0 offset 0x12000 not configured
zs0 at macobio0 offset 0x13000: irq 22,23
zstty0 at zs0 channel 0
zstty1 at zs0 channel 1
aoa0 at macobio0 offset 0x1: irq 30,1,2
"timer" at macobio0 offset 0x15000 not configured
adb0 at macobio0 offset 0x16000
apm0 at adb0: battery flags 0x0, 0% charged
piic0 at adb0
iic1 at piic0
maxtmp0 at iic1 addr 0xc8: max6642
kiic1 at macobio0 offset 0x18000
iic2 at kiic1
wdc0 at macobio0 offset 0x2 irq 24: DMA
audio0 at aoa0
ohci0 at pci1 dev 26 function 0 "Apple Intrepid USB" rev 0x00: irq 29, version 
1.0, legacy support
ohci1 at pci1 dev 27 function 0 "NEC USB" rev 0x43: irq 63, version 1.0
ohci2 at pci1 dev 27 function 1 "NEC USB" rev 0x43: irq 63, version 1.0
ehci0 at pci1 dev 27 function 2 "NEC USB" rev 0x04: irq 63
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 configuration 1 interface 0 "NEC EHCI root hub" rev 2.00/1.00 
addr 1
usb1 at ohci0: USB revision 1.0
uhub1 at usb1 configuration 1 interface 0 "Apple OHCI root hub" rev 1.00/1.00 
addr 1
usb2 at ohci1: USB revision 1.0
uhub2 at usb2 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 
addr 1
usb3 at ohci2: USB revision 1.0
uhub3 at usb3 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 
addr 1
mpcpcibr2 at mainbus0 pci: uni-north
pci2 at mpcpcibr2 bus 0
kauaiata0 at pci2 dev 13 function 0 "Apple Intrepid ATA" rev 0x00
wdc1 at kauaiata0 irq 39: DMA
wd0 at wdc1 channel 0 drive 0: 
wd0: 16-sector PIO, LBA48, 152627MB, 312581808 sectors
wd0(wdc1:0:0): using PIO mode 4, DMA mode 2, Ultra-DMA mode 5
"Apple UniNorth Firewire" rev 0x81 at pci2 dev 14 function 0 not configured
gem0 at pci2 dev 15 function 0 "Apple Uni-N2 GMAC" rev 0x80: irq 41, address 
00:14:51:17:42:34
bmtphy0 at gem0 phy 0: BCM5221 100baseTX PHY, rev. 4
uhub4 at uhub3 port 1 configuration 1 interface 0 "Mitsumi Electric Hub in 
Apple Extended USB Keyboard" rev 1.10/4.10 addr 2
uhidev0 at uhub4 port 3 configuration 1 interface 0 "Mitsumi Electric Apple 
Extended USB Keyboard" rev 1.10/4.10 addr 3
uhidev0: iclass 3/1
ukbd0 at uhidev0: 8 variable keys, 6 key codes, country code 13
wskbd0 at ukbd0: console keyboard
uhidev1 at uhub4 port 3 configuration 1 interface 1 "Mitsumi Electric Apple 
Extended USB Keyboard" rev 1.10/4.10 addr 3
uhidev1: iclass 3/0, 3 report ids
uhid0 at uhidev1 reportid 2: input=1, output=0, feature=0
ucc0 at uhidev1 reportid 3: 4 usages, 4 keys, enum
wskbd1 at ucc0 mux 1
vscsi0 at root
scsibus1 at vscsi0: 256 targets
softraid0 at root
scsibus2 at softraid0: 256 targets
bootpath: /pci@f400/ata-6@d/disk@0:/bsd
root on wd0a (64c5f028a86139ec.a) swap on wd0b dump on wd0b

Re: request for testing: malloc and large allocations

2022-02-01 Thread Alexander Bluhm
On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote:
> > Are you running with any malloc flags?
> 
> This bug report enabled me to find a bug that would pop up if G mode
> is enabled.
> 
> New diff below. New tests appreciated.

It passed a full regress run on amd64 with vm.malloc_conf set to CFGJU.

bluhm

> Index: stdlib/malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.272
> diff -u -p -r1.272 malloc.c
> --- stdlib/malloc.c   19 Sep 2021 09:15:22 -  1.272
> +++ stdlib/malloc.c   31 Jan 2022 16:27:31 -
> @@ -113,13 +113,27 @@ struct region_info {
>  
>  LIST_HEAD(chunk_head, chunk_info);
>  
> -#define MAX_CACHEABLE_SIZE   32
> -struct cache {
> - void *pages[MALLOC_MAXCACHE];
> +/*
> + * Two caches, one for "small" regions, one for "big".
> + * Small cache is an array per size, big cache is one array with different
> + * sized regions
> + */
> +#define MAX_SMALLCACHEABLE_SIZE  32
> +#define MAX_BIGCACHEABLE_SIZE512
> +/* If the total # of pages is larger than this, evict before inserting */
> +#define BIGCACHE_FILL(sz)(MAX_BIGCACHEABLE_SIZE * (sz) / 4)
> +
> +struct smallcache {
> + void **pages;
>   ushort length;
>   ushort max;
>  };
>  
> +struct bigcache {
> + void *page;
> + size_t psize;
> +};
> +
>  struct dir_info {
>   u_int32_t canary1;
>   int active; /* status of malloc */
> @@ -139,7 +153,10 @@ struct dir_info {
>   void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1];
>   u_char rbytes[32];  /* random bytes */
>   /* free pages cache */
> - struct cache cache[MAX_CACHEABLE_SIZE];
> + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE];
> + size_t bigcache_used;
> + size_t bigcache_size;
> + struct bigcache *bigcache;
>  #ifdef MALLOC_STATS
>   size_t inserts;
>   size_t insert_collisions;
> @@ -207,7 +224,7 @@ struct malloc_readonly {
>  #ifdef MALLOC_STATS
>   int malloc_stats;   /* dump statistics at end */
>  #endif
> - u_int32_t malloc_canary;/* Matched against ones in malloc_pool 
> */
> + u_int32_t malloc_canary;/* Matched against ones in pool */
>  };
>  
>  /* This object is mapped PROT_READ after initialisation to prevent tampering 
> */
> @@ -714,18 +731,61 @@ unmap(struct dir_info *d, void *p, size_
>   size_t psz = sz >> MALLOC_PAGESHIFT;
>   void *r;
>   u_short i;
> - struct cache *cache;
> + struct smallcache *cache;
>  
>   if (sz != PAGEROUND(sz) || psz == 0)
>   wrterror(d, "munmap round");
>  
> - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) {
> + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
> + psz <= MAX_BIGCACHEABLE_SIZE) {
> + u_short base = getrbyte(d);
> + u_short j;
> +
> + /* don't look through all slots */
> + for (j = 0; j < d->bigcache_size / 4; j++) {
> + i = (base + j) % d->bigcache_size;
> + if (d->bigcache_used <
> + BIGCACHE_FILL(d->bigcache_size))  {
> + if (d->bigcache[i].psize == 0)
> + break;
> + } else {
> + if (d->bigcache[i].psize != 0)
> + break;
> + }
> + }
> + /* if we didn't find a preferred slot, use random one */
> + if (d->bigcache[i].psize != 0) {
> + size_t tmp;
> +
> + r = d->bigcache[i].page;
> + d->bigcache_used -= d->bigcache[i].psize;
> + tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT;
> + if (!mopts.malloc_freeunmap)
> + validate_junk(d, r, tmp);
> + if (munmap(r, tmp))
> +  wrterror(d, "munmap %p", r);
> + STATS_SUB(d->malloc_used, tmp);
> + }
> + 
> + if (clear > 0)
> + explicit_bzero(p, clear);
> + if (mopts.malloc_freeunmap) {
> + if (mprotect(p, sz, PROT_NONE))
> + wrterror(d, "mprotect %p", r);
> + } else
> + junk_free(d->malloc_junk, p, sz);
> + d->bigcache[i].page = p;
> + d->bigcache[i].psize = psz;
> + d->bigcache_used += psz;
> + return;
> + }
> + if (psz > MAX_SMALLCACHEABLE_SIZE || d->smallcache[psz - 1].max == 0) {
>   if (munmap(p, sz))
>   wrterror(d, "munmap %p", p);
>   STATS_SUB(d->malloc_used, sz);
>   return;
>   }
> - cache = &d->cache[psz - 1];
> + cache = 

Re: request for testing: malloc and large allocations

2022-01-31 Thread Otto Moerbeek
On Fri, Jan 28, 2022 at 05:17:48PM +0100, Otto Moerbeek wrote:

> On Fri, Jan 28, 2022 at 04:33:28PM +0100, Alexander Bluhm wrote:
> 
> > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:
> > > currently malloc does cache a number of free'ed regions up to 128k in
> > > size. This cache is indexed by size (in # of pages), so it is very
> > > quick to check.
> > >
> > > Some programs allocate and deallocate larger allocations in a frantic
> > > way.  Accommodate those programs by also keeping a cache of regions
> > > between 128k and 2M, in a cache of variable sized regions.
> > >
> > > My test case speeds up about twice. A make build gets a small speedup.
> > >
> > > This has been tested by myself on amd64 quite intensively. I am asking
> > > for more tests, especially on more "exotic" platforms. I will do arm64
> > > myself soon.  Testing can be running your favorite programs, doing make
> > > builds or running tests in regress/lib/libc/malloc.
> > 
> > I see openssl and tmux crash with this diff.
> > /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64,
> > i386.
> 
> Are you running with any malloc flags?

This bug report enabled me to find a bug that would pop up if G mode
is enabled.

New diff below. New tests appreciated.

-Otto

Index: stdlib/malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.272
diff -u -p -r1.272 malloc.c
--- stdlib/malloc.c 19 Sep 2021 09:15:22 -  1.272
+++ stdlib/malloc.c 31 Jan 2022 16:27:31 -
@@ -113,13 +113,27 @@ struct region_info {
 
 LIST_HEAD(chunk_head, chunk_info);
 
-#define MAX_CACHEABLE_SIZE 32
-struct cache {
-   void *pages[MALLOC_MAXCACHE];
+/*
+ * Two caches, one for "small" regions, one for "big".
+ * Small cache is an array per size, big cache is one array with different
+ * sized regions
+ */
+#define MAX_SMALLCACHEABLE_SIZE32
+#define MAX_BIGCACHEABLE_SIZE  512
+/* If the total # of pages is larger than this, evict before inserting */
+#define BIGCACHE_FILL(sz)  (MAX_BIGCACHEABLE_SIZE * (sz) / 4)
+
+struct smallcache {
+   void **pages;
ushort length;
ushort max;
 };
 
+struct bigcache {
+   void *page;
+   size_t psize;
+};
+
 struct dir_info {
u_int32_t canary1;
int active; /* status of malloc */
@@ -139,7 +153,10 @@ struct dir_info {
void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1];
u_char rbytes[32];  /* random bytes */
/* free pages cache */
-   struct cache cache[MAX_CACHEABLE_SIZE];
+   struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE];
+   size_t bigcache_used;
+   size_t bigcache_size;
+   struct bigcache *bigcache;
 #ifdef MALLOC_STATS
size_t inserts;
size_t insert_collisions;
@@ -207,7 +224,7 @@ struct malloc_readonly {
 #ifdef MALLOC_STATS
int malloc_stats;   /* dump statistics at end */
 #endif
-   u_int32_t malloc_canary;/* Matched against ones in malloc_pool 
*/
+   u_int32_t malloc_canary;/* Matched against ones in pool */
 };
 
 /* This object is mapped PROT_READ after initialisation to prevent tampering */
@@ -714,18 +731,61 @@ unmap(struct dir_info *d, void *p, size_
size_t psz = sz >> MALLOC_PAGESHIFT;
void *r;
u_short i;
-   struct cache *cache;
+   struct smallcache *cache;
 
if (sz != PAGEROUND(sz) || psz == 0)
wrterror(d, "munmap round");
 
-   if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) {
+   if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
+   psz <= MAX_BIGCACHEABLE_SIZE) {
+   u_short base = getrbyte(d);
+   u_short j;
+
+   /* don't look through all slots */
+   for (j = 0; j < d->bigcache_size / 4; j++) {
+   i = (base + j) % d->bigcache_size;
+   if (d->bigcache_used <
+   BIGCACHE_FILL(d->bigcache_size))  {
+   if (d->bigcache[i].psize == 0)
+   break;
+   } else {
+   if (d->bigcache[i].psize != 0)
+   break;
+   }
+   }
+   /* if we didn't find a preferred slot, use random one */
+   if (d->bigcache[i].psize != 0) {
+   size_t tmp;
+
+   r = d->bigcache[i].page;
+   d->bigcache_used -= d->bigcache[i].psize;
+   tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT;
+   if (!mopts.malloc_freeunmap)
+   validate_junk(d, r, tmp);
+   if (munmap(r, tmp))
+wrterror(d, "munmap %p", 

Re: request for testing: malloc and large allocations

2022-01-28 Thread Otto Moerbeek
On Fri, Jan 28, 2022 at 04:33:28PM +0100, Alexander Bluhm wrote:

> On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:
> > currently malloc does cache a number of free'ed regions up to 128k in
> > size. This cache is indexed by size (in # of pages), so it is very
> > quick to check.
> >
> > Some programs allocate and deallocate larger allocations in a frantic
> > way.  Accommodate those programs by also keeping a cache of regions
> > between 128k and 2M, in a cache of variable sized regions.
> >
> > My test case speeds up about twice. A make build gets a small speedup.
> >
> > This has been tested by myself on amd64 quite intensively. I am asking
> > for more tests, especially on more "exotic" platforms. I will do arm64
> > myself soon.  Testing can be running your favorite programs, doing make
> > builds or running tests in regress/lib/libc/malloc.
> 
> I see openssl and tmux crash with this diff.
> /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64,
> i386.

Are you running with any malloc flags?

-Otto

> 
> #0  L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52
> 52  L1: rep
> (gdb) bt
> #0  L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52
> #1  0x0fde4ed058d8 in _libc_explicit_bzero (buf=0xfded1f8e000, 
> len=18446744073709549600) at /usr/src/lib/libc/string/explicit_bzero.c:17
> #2  0x0fde4ed6f84f in unmap (d=0xfdead893830, p=Variable "p" is not 
> available.
> ) at /usr/src/lib/libc/stdlib/malloc.c:805
> #3  0x0fde4ed6ceca in ofree (argpool=0x7f7c2268, p=Variable "p" is 
> not available.
> ) at /usr/src/lib/libc/stdlib/malloc.c:1511
> #4  0x0fde4ed6e4cb in _libc_recallocarray (ptr=0xfded1f8e7e0, 
> oldnmemb=Variable "oldnmemb" is not available.
> ) at /usr/src/lib/libc/stdlib/malloc.c:1908
> #5  0x0fdbd4787afe in xrecallocarray (ptr=Unhandled dwarf expression 
> opcode 0xa3
> ) at /usr/src/usr.bin/tmux/xmalloc.c:81
> #6  0x0fdbd4703ce6 in cmd_parse_build_commands (cmds=Unhandled dwarf 
> expression opcode 0xa3
> ) at cmd-parse.y:815
> #7  0x0fdbd4703d47 in cmd_parse_build_commands (cmds=Unhandled dwarf 
> expression opcode 0xa3
> ) at cmd-parse.y:823
> #8  0x0fdbd47040f7 in cmd_parse_from_buffer (buf=Unhandled dwarf 
> expression opcode 0xa3
> ) at cmd-parse.y:1036
> #9  0x0fdbd4703ffd in cmd_parse_from_string (
> s=0xfdbd46d6522 "bind > { display-menu -xP -yP -T 
> '#[align=centre]#{pane_index} (#{pane_id})'  
> '#{?#{m/r:(copy|view)-mode,#{pane_mode}},Go To Top,}' '<' {send -X 
> history-top} '#{?#{m/r:(copy|view)-mode,#{pane_mode}},G"..., 
> pi=0x7f7c24a0) at cmd-parse.y:959
> #10 0x0fdbd473506e in key_bindings_init () at 
> /usr/src/usr.bin/tmux/key-bindings.c:636
> #11 0x0fdbd47564c8 in server_start (client=0xfdec1056000, 
> flags=402653184, base=0xfdec103f400, lockfd=5, lockfile=0xfdec1041b80 
> "/tmp/tmux-0/default.lock")
> at /usr/src/usr.bin/tmux/server.c:210
> #12 0x0fdbd46f8788 in client_main (base=0xfdec103f400, argc=Unhandled 
> dwarf expression opcode 0xa3
> ) at /usr/src/usr.bin/tmux/client.c:165
> #13 0x0fdbd4761b62 in main (argc=0, argv=0x7f7c2880) at 
> /usr/src/usr.bin/tmux/tmux.c:529
> 
> bluhm



Re: request for testing: malloc and large allocations

2022-01-28 Thread Alexander Bluhm
On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:
> currently malloc does cache a number of free'ed regions up to 128k in
> size. This cache is indexed by size (in # of pages), so it is very
> quick to check.
> 
> Some programs allocate and deallocate larger allocations in a frantic
> way.  Accommodate those programs by also keeping a cache of regions
> between 128k and 2M, in a cache of variable sized regions.
> 
> My test case speeds up about twice. A make build gets a small speedup.
> 
> This has been tested by myself on amd64 quite intensively. I am asking
> for more tests, especially on more "exotic" platforms. I will do arm64
> myself soon.  Testing can be running your favorite programs, doing make
> builds or running tests in regress/lib/libc/malloc.

I see openssl and tmux crash with this diff.
/usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64,
i386.

#0  L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52
52  L1: rep
(gdb) bt
#0  L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52
#1  0x0fde4ed058d8 in _libc_explicit_bzero (buf=0xfded1f8e000, 
len=18446744073709549600) at /usr/src/lib/libc/string/explicit_bzero.c:17
#2  0x0fde4ed6f84f in unmap (d=0xfdead893830, p=Variable "p" is not 
available.
) at /usr/src/lib/libc/stdlib/malloc.c:805
#3  0x0fde4ed6ceca in ofree (argpool=0x7f7c2268, p=Variable "p" is not 
available.
) at /usr/src/lib/libc/stdlib/malloc.c:1511
#4  0x0fde4ed6e4cb in _libc_recallocarray (ptr=0xfded1f8e7e0, 
oldnmemb=Variable "oldnmemb" is not available.
) at /usr/src/lib/libc/stdlib/malloc.c:1908
#5  0x0fdbd4787afe in xrecallocarray (ptr=Unhandled dwarf expression opcode 
0xa3
) at /usr/src/usr.bin/tmux/xmalloc.c:81
#6  0x0fdbd4703ce6 in cmd_parse_build_commands (cmds=Unhandled dwarf 
expression opcode 0xa3
) at cmd-parse.y:815
#7  0x0fdbd4703d47 in cmd_parse_build_commands (cmds=Unhandled dwarf 
expression opcode 0xa3
) at cmd-parse.y:823
#8  0x0fdbd47040f7 in cmd_parse_from_buffer (buf=Unhandled dwarf expression 
opcode 0xa3
) at cmd-parse.y:1036
#9  0x0fdbd4703ffd in cmd_parse_from_string (
s=0xfdbd46d6522 "bind > { display-menu -xP -yP -T 
'#[align=centre]#{pane_index} (#{pane_id})'  
'#{?#{m/r:(copy|view)-mode,#{pane_mode}},Go To Top,}' '<' {send -X history-top} 
'#{?#{m/r:(copy|view)-mode,#{pane_mode}},G"..., pi=0x7f7c24a0) at 
cmd-parse.y:959
#10 0x0fdbd473506e in key_bindings_init () at 
/usr/src/usr.bin/tmux/key-bindings.c:636
#11 0x0fdbd47564c8 in server_start (client=0xfdec1056000, flags=402653184, 
base=0xfdec103f400, lockfd=5, lockfile=0xfdec1041b80 "/tmp/tmux-0/default.lock")
at /usr/src/usr.bin/tmux/server.c:210
#12 0x0fdbd46f8788 in client_main (base=0xfdec103f400, argc=Unhandled dwarf 
expression opcode 0xa3
) at /usr/src/usr.bin/tmux/client.c:165
#13 0x0fdbd4761b62 in main (argc=0, argv=0x7f7c2880) at 
/usr/src/usr.bin/tmux/tmux.c:529

bluhm



Re: request for testing: malloc and large allocations

2022-01-25 Thread Otto Moerbeek
On Sat, Jan 22, 2022 at 09:25:25AM +0100, Otto Moerbeek wrote:

> On Mon, Jan 17, 2022 at 08:42:47AM +0100, Otto Moerbeek wrote:
> 
> > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:
> > 
> > > Hi,
> > > 
> > > currently malloc does cache a number of free'ed regions up to 128k in
> > > size. This cache is indexed by size (in # of pages), so it is very
> > > quick to check.
> > > 
> > > Some programs allocate and deallocate larger allocations in a frantic
> > > way.  Accommodate those programs by also keeping a cache of regions
> > > between 128k and 2M, in a cache of variable sized regions.
> > > 
> > > My test case speeds up about twice. A make build gets a small speedup.
> > > 
> > > This has been tested by myself on amd64 quite intensively. I am asking
> > > for more tests, especially on more "exotic" platforms. I will do arm64
> > > myself soon.  Testing can be running your favorite programs, doing make
> > > builds or running tests in regress/lib/libc/malloc.
> > > 
> > > Thanks in advance!
> > 
> > 
> > I received several success reports and one failure report on macppc
> > from Miod. I'm investigating that report to see if this diff can be
> > blamed.
> > 
> > In the meantime: keep on testing!
> > 
> > -Otto
> 
> So far I have been unable to reproduce. I would like confirmation
> either way from somebody else with a macppc machine. Any volunteer?
> 
> Thanks,
> 
>   -Otto

It turns out the macppc troubles have been resolved by an uvm commit.

-Otto



Re: request for testing: malloc and large allocations

2022-01-22 Thread Otto Moerbeek
On Mon, Jan 17, 2022 at 08:42:47AM +0100, Otto Moerbeek wrote:

> On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:
> 
> > Hi,
> > 
> > currently malloc does cache a number of free'ed regions up to 128k in
> > size. This cache is indexed by size (in # of pages), so it is very
> > quick to check.
> > 
> > Some programs allocate and deallocate larger allocations in a frantic
> > way.  Accommodate those programs by also keeping a cache of regions
> > between 128k and 2M, in a cache of variable sized regions.
> > 
> > My test case speeds up about twice. A make build gets a small speedup.
> > 
> > This has been tested by myself on amd64 quite intensively. I am asking
> > for more tests, especially on more "exotic" platforms. I will do arm64
> > myself soon.  Testing can be running your favorite programs, doing make
> > builds or running tests in regress/lib/libc/malloc.
> > 
> > Thanks in advance!
> 
> 
> I received several success reports and one failure report on macppc
> from Miod. I'm investigating that report to see if this diff can be
> blamed.
> 
> In the meantime: keep on testing!
> 
>   -Otto

So far I have been unable to reproduce. I would like confirmation
either way from somebody else with a macppc machine. Any volunteer?

Thanks,

-Otto



Re: request for testing: malloc and large allocations

2022-01-16 Thread Otto Moerbeek
On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote:

> Hi,
> 
> currently malloc does cache a number of free'ed regions up to 128k in
> size. This cache is indexed by size (in # of pages), so it is very
> quick to check.
> 
> Some programs allocate and deallocate larger allocations in a frantic
> way.  Accommodate those programs by also keeping a cache of regions
> between 128k and 2M, in a cache of variable sized regions.
> 
> My test case speeds up about twice. A make build gets a small speedup.
> 
> This has been tested by myself on amd64 quite intensively. I am asking
> for more tests, especially on more "exotic" platforms. I will do arm64
> myself soon.  Testing can be running your favorite programs, doing make
> builds or running tests in regress/lib/libc/malloc.
> 
> Thanks in advance!


I received several success reports and one failure report on macppc
from Miod. I'm investigating that report to see if this diff can be
blamed.

In the meantime: keep on testing!

-Otto



Re: request for testing: malloc and large allocations

2022-01-13 Thread Leo Unglaub

Hey,

On 1/9/22 14:54, Otto Moerbeek wrote:

currently malloc does cache a number of free'ed regions up to 128k in
size. This cache is indexed by size (in # of pages), so it is very
quick to check.

Some programs allocate and deallocate larger allocations in a frantic
way.  Accommodate those programs by also keeping a cache of regions
between 128k and 2M, in a cache of variable sized regions.

My test case speeds up about twice. A make build gets a small speedup.

This has been tested by myself on amd64 quite intensively. I am asking
for more tests, especially on more "exotic" platforms. I will do arm64
myself soon.  Testing can be running your favorite programs, doing make
builds or running tests in regress/lib/libc/malloc.

Thanks in advance!


I did some workloads with your patch applied. I did not notice any
speedups (but I also did not time it, just a subjective feeling while
waiting for cargo build to finish). To me everything works fine. I am on
a normal amd64; sadly I don't have other hardware available.


I hope this helps
Greetings
Leo



Re: request for testing: malloc and large allocations

2022-01-13 Thread Matthias Schmidt
Hi Otto,

* Otto Moerbeek wrote:
> Hi,
> 
> currently malloc does cache a number of free'ed regions up to 128k in
> size. This cache is indexed by size (in # of pages), so it is very
> quick to check.
> 
> Some programs allocate and deallocate larger allocations in a frantic
> way.  Accommodate those programs by also keeping a cache of regions
> between 128k and 2M, in a cache of variable sized regions.
> 
> My test case speeds up about twice. A make build gets a small speedup.
> 
> This has been tested by myself on amd64 quite intensively. I am asking
> for more tests, especially on more "exotic" platforms. I will do arm64
> myself soon.  Testing can be running your favorite programs, doing make
> builds or running tests in regress/lib/libc/malloc.

I have your patch running on an amd64 Thinkpad T450s with usual desktop
usage and noticed no regression so far.

Cheers

Matthias



Re: request for testing: malloc and large allocations

2022-01-13 Thread Stuart Henderson
On 2022/01/09 14:54, Otto Moerbeek wrote:
> currently malloc does cache a number of free'ed regions up to 128k in
> size. This cache is indexed by size (in # of pages), so it is very
> quick to check.
> 
> Some programs allocate and deallocate larger allocations in a frantic
> way.  Accommodate those programs by also keeping a cache of regions
> between 128k and 2M, in a cache of variable sized regions.
> 
> My test case speeds up about twice. A make build gets a small speedup.
> 
> This has been tested by myself on amd64 quite intensively. I am asking
> for more tests, especially on more "exotic" platforms. I will do arm64
> myself soon.  Testing can be running your favorite programs, doing make
> builds or running tests in regress/lib/libc/malloc.

This has been through mkr and ports bulk build on i386.
Ports build times vary too much to say if it's faster there but
no issues have been seen.

> Thanks in advance!
> 
>   -Otto
> 
> Index: stdlib/malloc.c
> ===
> RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
> retrieving revision 1.272
> diff -u -p -r1.272 malloc.c
> --- stdlib/malloc.c   19 Sep 2021 09:15:22 -  1.272
> +++ stdlib/malloc.c   9 Jan 2022 13:10:35 -
> @@ -113,13 +113,28 @@ struct region_info {
>  
>  LIST_HEAD(chunk_head, chunk_info);
>  
> -#define MAX_CACHEABLE_SIZE   32
> -struct cache {
> - void *pages[MALLOC_MAXCACHE];
> +/*
> + * Two caches, one for "small" regions, one for "big".
> + * Small cache is an array per size, big cache is one array with different
> + * sized regions
> + */
> +#define MAX_SMALLCACHEABLE_SIZE  32
> +#define MAX_BIGCACHEABLE_SIZE512
> +#define BIGCACHE_SIZEMALLOC_MAXCACHE
> +/* If the total # of pages is larger than this, evict before inserting */
> +#define BIGCACHE_FILL(sz)(MAX_BIGCACHEABLE_SIZE * (sz) / 4)
> +
> +struct smallcache {
> + void **pages;
>   ushort length;
>   ushort max;
>  };
>  
> +struct bigcache {
> + void *page;
> + size_t psize;
> +};
> +
>  struct dir_info {
>   u_int32_t canary1;
>   int active; /* status of malloc */
> @@ -139,7 +154,10 @@ struct dir_info {
>   void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1];
>   u_char rbytes[32];  /* random bytes */
>   /* free pages cache */
> - struct cache cache[MAX_CACHEABLE_SIZE];
> + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE];
> + ushort bigcache_size;
> + size_t bigcache_used;
> + struct bigcache bigcache[BIGCACHE_SIZE];
>  #ifdef MALLOC_STATS
>   size_t inserts;
>   size_t insert_collisions;
> @@ -714,18 +732,61 @@ unmap(struct dir_info *d, void *p, size_
>   size_t psz = sz >> MALLOC_PAGESHIFT;
>   void *r;
>   u_short i;
> - struct cache *cache;
> + struct smallcache *cache;
>  
>   if (sz != PAGEROUND(sz) || psz == 0)
>   wrterror(d, "munmap round");
>  
> - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) {
> + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
> + psz <= MAX_BIGCACHEABLE_SIZE) {
> + u_short base = getrbyte(d);
> + u_short j;
> +
> + /* don't look through all slots */
> + for (j = 0; j < d->bigcache_size / 4; j++) {
> + i = (base + j) % d->bigcache_size;
> + if (d->bigcache_used <
> + BIGCACHE_FILL(d->bigcache_size))  {
> + if (d->bigcache[i].psize == 0)
> + break;
> + } else {
> + if (d->bigcache[i].psize != 0)
> + break;
> + }
> + }
> + /* if we didn't find a preferred slot, use random one */
> + if (d->bigcache[i].psize != 0) {
> + size_t tmp;
> +
> + r = d->bigcache[i].page;
> + d->bigcache_used -= d->bigcache[i].psize;
> + tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT;
> + if (!mopts.malloc_freeunmap)
> + validate_junk(d, r, tmp);
> + if (munmap(r, tmp))
> +  wrterror(d, "munmap %p", r);
> + STATS_SUB(d->malloc_used, tmp);
> + }
> + 
> + if (clear > 0)
> + explicit_bzero(p, clear);
> + if (mopts.malloc_freeunmap) {
> + if (mprotect(p, sz, PROT_NONE))
> + wrterror(d, "mprotect %p", r);
> + } else
> + junk_free(d->malloc_junk, p, sz);
> + d->bigcache[i].page = p;
> + d->bigcache[i].psize = psz;
> + d->bigcache_used += psz;
> + return;
> + }
> +   

request for testing: malloc and large allocations

2022-01-09 Thread Otto Moerbeek
Hi,

currently malloc does cache a number of free'ed regions up to 128k in
size. This cache is indexed by size (in # of pages), so it is very
quick to check.

Some programs allocate and deallocate larger allocations in a frantic
way.  Accommodate those programs by also keeping a cache of regions
between 128k and 2M, in a cache of variable sized regions.
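
As a rough sketch of that two-level layout (hypothetical names and sizes,
not the actual libc code), the idea can be pictured like this:

/*
 * Rough sketch only (hypothetical names and sizes, not the libc code):
 * a per-size "small" cache next to one "big" cache of variable-sized
 * regions that evicts an occupied slot once a fill target is exceeded.
 */
#include <stddef.h>

#define SMALL_SLOTS	32		/* regions of 1..32 pages */
#define BIG_SLOTS	64		/* one array, mixed sizes */
#define FILL_TARGET	(BIG_SLOTS * 4)	/* max total pages kept in the big cache */

struct big_entry {
	void	*page;
	size_t	 psize;			/* size in pages, 0 = empty slot */
};

static void *small_cache[SMALL_SLOTS];		/* one region per size class */
static struct big_entry big_cache[BIG_SLOTS];
static size_t big_used;				/* total pages in big_cache */

/*
 * Cache a freed region of psz pages (psz >= 1).  Returns a region the
 * caller should really munmap: an evicted entry, p itself if no slot
 * was found, or NULL if p was cached and nothing was evicted.
 */
void *
cache_region(void *p, size_t psz)
{
	size_t i;
	void *victim;

	if (psz <= SMALL_SLOTS) {
		victim = small_cache[psz - 1];	/* previous entry, if any */
		small_cache[psz - 1] = p;
		return victim;
	}
	for (i = 0; i < BIG_SLOTS; i++) {
		/* under the fill target look for an empty slot,
		 * above it look for an occupied slot to evict */
		if ((big_cache[i].psize == 0) == (big_used < FILL_TARGET)) {
			victim = big_cache[i].page;	/* NULL if slot was empty */
			big_used -= big_cache[i].psize;
			big_cache[i].page = p;
			big_cache[i].psize = psz;
			big_used += psz;
			return victim;
		}
	}
	return p;				/* no suitable slot found */
}

The diff below differs in the details: the real small cache keeps several
regions per size class, the big-cache slot search starts at a random offset
and only scans a quarter of the slots, and cached memory is junked or
mprotect'ed while it sits in the cache.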

My test case speeds up about twice. A make build gets a small speedup.

This has been tested by myself on amd64 quite intensively. I am asking
for more tests, especially on more "exotic" platforms. I will do arm64
myself soon.  Testing can be running your favorite programs, doing make
builds or running tests in regress/lib/libc/malloc.
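
For a quick workload beyond regress/lib/libc/malloc, a throwaway test in
the spirit described above could look like the following (a hypothetical
test program, not part of the tree); build it with cc and compare time(1)
output with and without the diff:

/*
 * frantic.c -- hypothetical smoke test, not part of the regress suite:
 * allocate and free regions in the 128k..2M range in a tight loop,
 * the pattern the big cache is meant to help.
 */
#include <err.h>
#include <stdlib.h>

#define NPTR	16
#define ROUNDS	100000
#define MINSZ	(128UL * 1024)
#define MAXSZ	(2UL * 1024 * 1024)

int
main(void)
{
	void *p[NPTR];
	size_t sz;
	int i, r;

	for (r = 0; r < ROUNDS; r++) {
		for (i = 0; i < NPTR; i++) {
			sz = MINSZ + arc4random_uniform(MAXSZ - MINSZ);
			if ((p[i] = malloc(sz)) == NULL)
				err(1, "malloc");
			((char *)p[i])[0] = 1;		/* touch first and last byte */
			((char *)p[i])[sz - 1] = 1;
		}
		for (i = 0; i < NPTR; i++)
			free(p[i]);
	}
	return 0;
}

arc4random_uniform() is in libc on OpenBSD; elsewhere substitute random().
The exact size distribution matters less than staying in the 128k..2M range,
so every free in the loop hits the new cache path.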

Thanks in advance!

-Otto

Index: stdlib/malloc.c
===
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.272
diff -u -p -r1.272 malloc.c
--- stdlib/malloc.c 19 Sep 2021 09:15:22 -  1.272
+++ stdlib/malloc.c 9 Jan 2022 13:10:35 -
@@ -113,13 +113,28 @@ struct region_info {
 
 LIST_HEAD(chunk_head, chunk_info);
 
-#define MAX_CACHEABLE_SIZE 32
-struct cache {
-   void *pages[MALLOC_MAXCACHE];
+/*
+ * Two caches, one for "small" regions, one for "big".
+ * Small cache is an array per size, big cache is one array with different
+ * sized regions
+ */
+#define MAX_SMALLCACHEABLE_SIZE32
+#define MAX_BIGCACHEABLE_SIZE  512
+#define BIGCACHE_SIZE  MALLOC_MAXCACHE
+/* If the total # of pages is larger than this, evict before inserting */
+#define BIGCACHE_FILL(sz)  (MAX_BIGCACHEABLE_SIZE * (sz) / 4)
+
+struct smallcache {
+   void **pages;
ushort length;
ushort max;
 };
 
+struct bigcache {
+   void *page;
+   size_t psize;
+};
+
 struct dir_info {
u_int32_t canary1;
int active; /* status of malloc */
@@ -139,7 +154,10 @@ struct dir_info {
void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1];
u_char rbytes[32];  /* random bytes */
/* free pages cache */
-   struct cache cache[MAX_CACHEABLE_SIZE];
+   struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE];
+   ushort bigcache_size;
+   size_t bigcache_used;
+   struct bigcache bigcache[BIGCACHE_SIZE];
 #ifdef MALLOC_STATS
size_t inserts;
size_t insert_collisions;
@@ -714,18 +732,61 @@ unmap(struct dir_info *d, void *p, size_
size_t psz = sz >> MALLOC_PAGESHIFT;
void *r;
u_short i;
-   struct cache *cache;
+   struct smallcache *cache;
 
if (sz != PAGEROUND(sz) || psz == 0)
wrterror(d, "munmap round");
 
-   if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) {
+   if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE &&
+   psz <= MAX_BIGCACHEABLE_SIZE) {
+   u_short base = getrbyte(d);
+   u_short j;
+
+   /* don't look through all slots */
+   for (j = 0; j < d->bigcache_size / 4; j++) {
+   i = (base + j) % d->bigcache_size;
+   if (d->bigcache_used <
+   BIGCACHE_FILL(d->bigcache_size))  {
+   if (d->bigcache[i].psize == 0)
+   break;
+   } else {
+   if (d->bigcache[i].psize != 0)
+   break;
+   }
+   }
+   /* if we didn't find a preferred slot, use random one */
+   if (d->bigcache[i].psize != 0) {
+   size_t tmp;
+
+   r = d->bigcache[i].page;
+   d->bigcache_used -= d->bigcache[i].psize;
+   tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT;
+   if (!mopts.malloc_freeunmap)
+   validate_junk(d, r, tmp);
+   if (munmap(r, tmp))
+wrterror(d, "munmap %p", r);
+   STATS_SUB(d->malloc_used, tmp);
+   }
+   
+   if (clear > 0)
+   explicit_bzero(p, clear);
+   if (mopts.malloc_freeunmap) {
+   if (mprotect(p, sz, PROT_NONE))
+   wrterror(d, "mprotect %p", r);
+   } else
+   junk_free(d->malloc_junk, p, sz);
+   d->bigcache[i].page = p;
+   d->bigcache[i].psize = psz;
+   d->bigcache_used += psz;
+   return;
+   }
+   if (psz > MAX_SMALLCACHEABLE_SIZE || d->smallcache[psz - 1].max == 0) {
if (munmap(p, sz))
wrterror(d, "munmap %p", p);
STATS_SUB(d->malloc_used, sz);
return;
}
-   cache = &d->cache[psz - 1];
+   cache =