Re: request for testing: malloc and large allocations
On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote: > On Fri, Jan 28, 2022 at 05:17:48PM +0100, Otto Moerbeek wrote: > > > On Fri, Jan 28, 2022 at 04:33:28PM +0100, Alexander Bluhm wrote: > > > > > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > > > > currently malloc does cache a number of free'ed regions up to 128k in > > > > size. This cache is indexed by size (in # of pages), so it is very > > > > quick to check. > > > > > > > > Some programs allocate and deallocate larger allocations in a frantic > > > > way. Accodomate those programs by also keeping a cache of regions > > > > betwen 128k and 2M, in a cache of variable sized regions. > > > > > > > > My test case speeds up about twice. A make build gets a small speedup. > > > > > > > > This has been tested by myself on amd64 quite intensively. I am asking > > > > for more tests, especialy on more "exotic" platforms. I wil do arm64 > > > > myself soon. Test can be running your favorite programs, doing make > > > > builds or running tests in regress/lib/libc/malloc. > > > > > > I see openssl and tmux crash with this diff. > > > /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64, > > > i386. > > > > Are you running with any malloc flags? > > This bug report enabled me to find a bug that would pop up if G mode > is enabled. > > New diff below. New tests appreciated. This has been in snaps for a while. Any body willing to review and OK? -Otto > Index: stdlib/malloc.c > === > RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v > retrieving revision 1.272 > diff -u -p -r1.272 malloc.c > --- stdlib/malloc.c 19 Sep 2021 09:15:22 - 1.272 > +++ stdlib/malloc.c 31 Jan 2022 16:27:31 - > @@ -113,13 +113,27 @@ struct region_info { > > LIST_HEAD(chunk_head, chunk_info); > > -#define MAX_CACHEABLE_SIZE 32 > -struct cache { > - void *pages[MALLOC_MAXCACHE]; > +/* > + * Two caches, one for "small" regions, one for "big". 
> + * Small cache is an array per size, big cache is one array with different > + * sized regions > + */ > +#define MAX_SMALLCACHEABLE_SIZE 32 > +#define MAX_BIGCACHEABLE_SIZE512 > +/* If the total # of pages is larger than this, evict before inserting */ > +#define BIGCACHE_FILL(sz)(MAX_BIGCACHEABLE_SIZE * (sz) / 4) > + > +struct smallcache { > + void **pages; > ushort length; > ushort max; > }; > > +struct bigcache { > + void *page; > + size_t psize; > +}; > + > struct dir_info { > u_int32_t canary1; > int active; /* status of malloc */ > @@ -139,7 +153,10 @@ struct dir_info { > void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1]; > u_char rbytes[32]; /* random bytes */ > /* free pages cache */ > - struct cache cache[MAX_CACHEABLE_SIZE]; > + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE]; > + size_t bigcache_used; > + size_t bigcache_size; > + struct bigcache *bigcache; > #ifdef MALLOC_STATS > size_t inserts; > size_t insert_collisions; > @@ -207,7 +224,7 @@ struct malloc_readonly { > #ifdef MALLOC_STATS > int malloc_stats; /* dump statistics at end */ > #endif > - u_int32_t malloc_canary;/* Matched against ones in malloc_pool > */ > + u_int32_t malloc_canary;/* Matched against ones in pool */ > }; > > /* This object is mapped PROT_READ after initialisation to prevent tampering > */ > @@ -714,18 +731,61 @@ unmap(struct dir_info *d, void *p, size_ > size_t psz = sz >> MALLOC_PAGESHIFT; > void *r; > u_short i; > - struct cache *cache; > + struct smallcache *cache; > > if (sz != PAGEROUND(sz) || psz == 0) > wrterror(d, "munmap round"); > > - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) { > + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE && > + psz <= MAX_BIGCACHEABLE_SIZE) { > + u_short base = getrbyte(d); > + u_short j; > + > + /* don't look through all slots */ > + for (j = 0; j < d->bigcache_size / 4; j++) { > + i = (base + j) % d->bigcache_size; > + if (d->bigcache_used < > + BIGCACHE_FILL(d->bigcache_size)) { > + if 
(d->bigcache[i].psize == 0) > + break; > + } else { > + if (d->bigcache[i].psize != 0) > + break; > + } > + } > + /* if we didn't find a preferred slot, use random one */ > + if (d->bigcache[i].psize != 0) { > + size_t tmp; > + > + r = d->bigcache[i].page; > + d->bigcache_used -= d->bigcache[i].psize; >
Re: request for testing: malloc and large allocations
On Sat, Feb 05, 2022 at 08:07:42PM +0100, Jan Stary wrote: > On Feb 05 17:35:46, o...@drijf.net wrote: > > On Sat, Feb 05, 2022 at 05:22:50PM +0100, Jan Stary wrote: > > > > > On Feb 02 00:04:37, alexander.bl...@gmx.net wrote: > > > > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote: > > > > > > Are you running with any malloc flags? > > > > > > > > > > This bug report enabled me to find a bug that would pop up if G mode > > > > > is enabled. > > > > > > > > > > New diff below. New tests appreciated. > > > > > > > > > Passed a make build on macppc (Mac Mini, dmesg below). > > > > > > 4288m32.43s real 3309m41.24s user 802m02.37s system > > > Recompiling now with the new malloc to see the difference. > > > > Note that during a make build, libs get installed after building them > > so all dynamically linked programs will use the new malloc from that > > point, as there is no revision bump. > > Does that mean that both passes of compiling the compiler > (i.e., compiling clang with the current clang, > and then compiling clang with the new clang, > if I remember correctly) already use the new libc? > > (Compiling clang has taken most of the ~3 days of make build.) Clang does a single pass afaik, it does not do a full bootstrap. That means clang will be compiled with the currently installed (old) clang, but that clang will use the newly built and installed libc. -Otto
Re: request for testing: malloc and large allocations
On Feb 05 17:35:46, o...@drijf.net wrote: > On Sat, Feb 05, 2022 at 05:22:50PM +0100, Jan Stary wrote: > > > On Feb 02 00:04:37, alexander.bl...@gmx.net wrote: > > > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote: > > > > > Are you running with any malloc flags? > > > > > > > > This bug report enabled me to find a bug that would pop up if G mode > > > > is enabled. > > > > > > > > New diff below. New tests appreciated. > > > > > > Passed a make build on macppc (Mac Mini, dmesg below). > > > > 4288m32.43s real 3309m41.24s user 802m02.37s system > > Recompiling now with the new malloc to see the difference. > > Note that during a make build, libs get installed after building them > so all dynamically linked programs will use the new malloc from that > point, as there is no revision bump. Does that mean that both passes of compiling the compiler (i.e., compiling clang with the current clang, and then compiling clang with the new clang, if I remember correctly) already use the new libc? (Compiling clang has taken most of the ~3 days of make build.) Jan > > [ using 1308496 bytes of bsd ELF symbol table ] > > console out [ATY,RockHopper2_A] console in [keyboard], using USB > > using parent ATY,RockHopper2Paren:: memaddr 9800, size 800 : > > consaddr 9c008000 : ioaddr 9002, size 2: width 1920 linebytes 2048 > > height 1080 depth 8 > > Copyright (c) 1982, 1986, 1989, 1991, 1993 > > The Regents of the University of California. All rights reserved. > > Copyright (c) 1995-2022 OpenBSD. All rights reserved. 
> > https://www.OpenBSD.org > > > > OpenBSD 7.0-current (GENERIC) #0: Mon Jan 31 13:22:27 CET 2022 > > h...@ppc.stare.cz:/usr/src/sys/arch/macppc/compile/GENERIC > > real mem = 1073741824 (1024MB) > > avail mem = 1025527808 (978MB) > > random: good seed from bootblocks > > mpath0 at root > > scsibus0 at mpath0: 256 targets > > mainbus0 at root: model PowerMac10,2 > > cpu0 at mainbus0: 7447A (Revision 0x102): 1499 MHz: 512KB L2 cache > > mem0 at mainbus0 > > spdmem0 at mem0: 1GB DDR SDRAM non-parity PC3200CL3.0 > > memc0 at mainbus0: uni-n rev 0xd2 > > "hw-clock" at memc0 not configured > > kiic0 at memc0 offset 0xf8001000 > > iic0 at kiic0 > > mpcpcibr0 at mainbus0 pci: uni-north > > pci0 at mpcpcibr0 bus 0 > > pchb0 at pci0 dev 11 function 0 "Apple UniNorth AGP" rev 0x00 > > agp at pchb0 not configured > > radeondrm0 at pci0 dev 16 function 0 "ATI Radeon 9200" rev 0x01 > > drm0 at radeondrm0 > > radeondrm0: irq 48 > > mpcpcibr1 at mainbus0 pci: uni-north > > pci1 at mpcpcibr1 bus 0 > > macobio0 at pci1 dev 23 function 0 "Apple Intrepid" rev 0x00 > > openpic0 at macobio0 offset 0x4: version 0x4614 feature 3f0302 LE > > macgpio0 at macobio0 offset 0x50 > > "modem-reset" at macgpio0 offset 0x1d not configured > > "modem-power" at macgpio0 offset 0x1c not configured > > macgpio1 at macgpio0 offset 0x9: irq 47 > > "programmer-switch" at macgpio0 offset 0x11 not configured > > "gpio5" at macgpio0 offset 0x6f not configured > > "gpio6" at macgpio0 offset 0x70 not configured > > "extint-gpio15" at macgpio0 offset 0x67 not configured > > "escc-legacy" at macobio0 offset 0x12000 not configured > > zs0 at macobio0 offset 0x13000: irq 22,23 > > zstty0 at zs0 channel 0 > > zstty1 at zs0 channel 1 > > aoa0 at macobio0 offset 0x1: irq 30,1,2 > > "timer" at macobio0 offset 0x15000 not configured > > adb0 at macobio0 offset 0x16000 > > apm0 at adb0: battery flags 0x0, 0% charged > > piic0 at adb0 > > iic1 at piic0 > > maxtmp0 at iic1 addr 0xc8: max6642 > > kiic1 at macobio0 offset 
0x18000 > > iic2 at kiic1 > > wdc0 at macobio0 offset 0x2 irq 24: DMA > > audio0 at aoa0 > > ohci0 at pci1 dev 26 function 0 "Apple Intrepid USB" rev 0x00: irq 29, > > version 1.0, legacy support > > ohci1 at pci1 dev 27 function 0 "NEC USB" rev 0x43: irq 63, version 1.0 > > ohci2 at pci1 dev 27 function 1 "NEC USB" rev 0x43: irq 63, version 1.0 > > ehci0 at pci1 dev 27 function 2 "NEC USB" rev 0x04: irq 63 > > usb0 at ehci0: USB revision 2.0 > > uhub0 at usb0 configuration 1 interface 0 "NEC EHCI root hub" rev 2.00/1.00 > > addr 1 > > usb1 at ohci0: USB revision 1.0 > > uhub1 at usb1 configuration 1 interface 0 "Apple OHCI root hub" rev > > 1.00/1.00 addr 1 > > usb2 at ohci1: USB revision 1.0 > > uhub2 at usb2 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 > > addr 1 > > usb3 at ohci2: USB revision 1.0 > > uhub3 at usb3 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 > > addr 1 > > mpcpcibr2 at mainbus0 pci: uni-north > > pci2 at mpcpcibr2 bus 0 > > kauaiata0 at pci2 dev 13 function 0 "Apple Intrepid ATA" rev 0x00 > > wdc1 at kauaiata0 irq 39: DMA > > wd0 at wdc1 channel 0 drive 0: > > wd0: 16-sector PIO, LBA48, 152627MB, 312581808 sectors > > wd0(wdc1:0:0): using PIO mode 4, DMA mode 2, Ultra-DMA mode 5 > > "Apple UniNorth Firewire" rev 0x81 at pci2 dev 14 function 0 not configured > > gem0 at pci2 dev 15 function 0 "Apple Uni-N2 GMAC" rev
Re: request for testing: malloc and large allocations
On Sat, Feb 05, 2022 at 05:22:50PM +0100, Jan Stary wrote: > On Feb 02 00:04:37, alexander.bl...@gmx.net wrote: > > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote: > > > > Are you running with any malloc flags? > > > > > > This bug report enabled me to find a bug that would pop up if G mode > > > is enabled. > > > > > > New diff below. New tests appreciated. > > > Passed a make build on macppc (Mac Mini, dmesg below). > > 4288m32.43s real 3309m41.24s user 802m02.37s system > Recompiling now with the new malloc to see the difference. Note that during a make build, libs get installed after building them so all dynamically linked programs will use the new malloc from that point, as there is no revision bump. -Otto > > > Jan > > > [ using 1308496 bytes of bsd ELF symbol table ] > console out [ATY,RockHopper2_A] console in [keyboard], using USB > using parent ATY,RockHopper2Paren:: memaddr 9800, size 800 : consaddr > 9c008000 : ioaddr 9002, size 2: width 1920 linebytes 2048 height 1080 > depth 8 > Copyright (c) 1982, 1986, 1989, 1991, 1993 > The Regents of the University of California. All rights reserved. > Copyright (c) 1995-2022 OpenBSD. All rights reserved. 
https://www.OpenBSD.org > > OpenBSD 7.0-current (GENERIC) #0: Mon Jan 31 13:22:27 CET 2022 > h...@ppc.stare.cz:/usr/src/sys/arch/macppc/compile/GENERIC > real mem = 1073741824 (1024MB) > avail mem = 1025527808 (978MB) > random: good seed from bootblocks > mpath0 at root > scsibus0 at mpath0: 256 targets > mainbus0 at root: model PowerMac10,2 > cpu0 at mainbus0: 7447A (Revision 0x102): 1499 MHz: 512KB L2 cache > mem0 at mainbus0 > spdmem0 at mem0: 1GB DDR SDRAM non-parity PC3200CL3.0 > memc0 at mainbus0: uni-n rev 0xd2 > "hw-clock" at memc0 not configured > kiic0 at memc0 offset 0xf8001000 > iic0 at kiic0 > mpcpcibr0 at mainbus0 pci: uni-north > pci0 at mpcpcibr0 bus 0 > pchb0 at pci0 dev 11 function 0 "Apple UniNorth AGP" rev 0x00 > agp at pchb0 not configured > radeondrm0 at pci0 dev 16 function 0 "ATI Radeon 9200" rev 0x01 > drm0 at radeondrm0 > radeondrm0: irq 48 > mpcpcibr1 at mainbus0 pci: uni-north > pci1 at mpcpcibr1 bus 0 > macobio0 at pci1 dev 23 function 0 "Apple Intrepid" rev 0x00 > openpic0 at macobio0 offset 0x4: version 0x4614 feature 3f0302 LE > macgpio0 at macobio0 offset 0x50 > "modem-reset" at macgpio0 offset 0x1d not configured > "modem-power" at macgpio0 offset 0x1c not configured > macgpio1 at macgpio0 offset 0x9: irq 47 > "programmer-switch" at macgpio0 offset 0x11 not configured > "gpio5" at macgpio0 offset 0x6f not configured > "gpio6" at macgpio0 offset 0x70 not configured > "extint-gpio15" at macgpio0 offset 0x67 not configured > "escc-legacy" at macobio0 offset 0x12000 not configured > zs0 at macobio0 offset 0x13000: irq 22,23 > zstty0 at zs0 channel 0 > zstty1 at zs0 channel 1 > aoa0 at macobio0 offset 0x1: irq 30,1,2 > "timer" at macobio0 offset 0x15000 not configured > adb0 at macobio0 offset 0x16000 > apm0 at adb0: battery flags 0x0, 0% charged > piic0 at adb0 > iic1 at piic0 > maxtmp0 at iic1 addr 0xc8: max6642 > kiic1 at macobio0 offset 0x18000 > iic2 at kiic1 > wdc0 at macobio0 offset 0x2 irq 24: DMA > audio0 at aoa0 > ohci0 at 
pci1 dev 26 function 0 "Apple Intrepid USB" rev 0x00: irq 29, > version 1.0, legacy support > ohci1 at pci1 dev 27 function 0 "NEC USB" rev 0x43: irq 63, version 1.0 > ohci2 at pci1 dev 27 function 1 "NEC USB" rev 0x43: irq 63, version 1.0 > ehci0 at pci1 dev 27 function 2 "NEC USB" rev 0x04: irq 63 > usb0 at ehci0: USB revision 2.0 > uhub0 at usb0 configuration 1 interface 0 "NEC EHCI root hub" rev 2.00/1.00 > addr 1 > usb1 at ohci0: USB revision 1.0 > uhub1 at usb1 configuration 1 interface 0 "Apple OHCI root hub" rev 1.00/1.00 > addr 1 > usb2 at ohci1: USB revision 1.0 > uhub2 at usb2 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 > addr 1 > usb3 at ohci2: USB revision 1.0 > uhub3 at usb3 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 > addr 1 > mpcpcibr2 at mainbus0 pci: uni-north > pci2 at mpcpcibr2 bus 0 > kauaiata0 at pci2 dev 13 function 0 "Apple Intrepid ATA" rev 0x00 > wdc1 at kauaiata0 irq 39: DMA > wd0 at wdc1 channel 0 drive 0: > wd0: 16-sector PIO, LBA48, 152627MB, 312581808 sectors > wd0(wdc1:0:0): using PIO mode 4, DMA mode 2, Ultra-DMA mode 5 > "Apple UniNorth Firewire" rev 0x81 at pci2 dev 14 function 0 not configured > gem0 at pci2 dev 15 function 0 "Apple Uni-N2 GMAC" rev 0x80: irq 41, address > 00:14:51:17:42:34 > bmtphy0 at gem0 phy 0: BCM5221 100baseTX PHY, rev. 4 > uhub4 at uhub3 port 1 configuration 1 interface 0 "Mitsumi Electric Hub in > Apple Extended USB Keyboard" rev 1.10/4.10 addr 2 > uhidev0 at uhub4 port 3 configuration 1 interface 0 "Mitsumi Electric Apple > Extended USB Keyboard" rev 1.10/4.10 addr 3 > uhidev0: iclass 3/1 > ukbd0 at uhidev0: 8 variable keys, 6 key codes, country code 13 > wskbd0 at ukbd0: console keyboard > uhidev1 at
Re: request for testing: malloc and large allocations
On Feb 02 00:04:37, alexander.bl...@gmx.net wrote: > On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote: > > > Are you running with any malloc flags? > > > > This bug report enabled me to find a bug that would pop up if G mode > > is enabled. > > > > New diff below. New tests appreciated. Passed a make build on macppc (Mac Mini, dmesg below). 4288m32.43s real 3309m41.24s user 802m02.37s system Recompiling now with the new malloc to see the difference. Jan [ using 1308496 bytes of bsd ELF symbol table ] console out [ATY,RockHopper2_A] console in [keyboard], using USB using parent ATY,RockHopper2Paren:: memaddr 9800, size 800 : consaddr 9c008000 : ioaddr 9002, size 2: width 1920 linebytes 2048 height 1080 depth 8 Copyright (c) 1982, 1986, 1989, 1991, 1993 The Regents of the University of California. All rights reserved. Copyright (c) 1995-2022 OpenBSD. All rights reserved. https://www.OpenBSD.org OpenBSD 7.0-current (GENERIC) #0: Mon Jan 31 13:22:27 CET 2022 h...@ppc.stare.cz:/usr/src/sys/arch/macppc/compile/GENERIC real mem = 1073741824 (1024MB) avail mem = 1025527808 (978MB) random: good seed from bootblocks mpath0 at root scsibus0 at mpath0: 256 targets mainbus0 at root: model PowerMac10,2 cpu0 at mainbus0: 7447A (Revision 0x102): 1499 MHz: 512KB L2 cache mem0 at mainbus0 spdmem0 at mem0: 1GB DDR SDRAM non-parity PC3200CL3.0 memc0 at mainbus0: uni-n rev 0xd2 "hw-clock" at memc0 not configured kiic0 at memc0 offset 0xf8001000 iic0 at kiic0 mpcpcibr0 at mainbus0 pci: uni-north pci0 at mpcpcibr0 bus 0 pchb0 at pci0 dev 11 function 0 "Apple UniNorth AGP" rev 0x00 agp at pchb0 not configured radeondrm0 at pci0 dev 16 function 0 "ATI Radeon 9200" rev 0x01 drm0 at radeondrm0 radeondrm0: irq 48 mpcpcibr1 at mainbus0 pci: uni-north pci1 at mpcpcibr1 bus 0 macobio0 at pci1 dev 23 function 0 "Apple Intrepid" rev 0x00 openpic0 at macobio0 offset 0x4: version 0x4614 feature 3f0302 LE macgpio0 at macobio0 offset 0x50 "modem-reset" at macgpio0 offset 0x1d not 
configured "modem-power" at macgpio0 offset 0x1c not configured macgpio1 at macgpio0 offset 0x9: irq 47 "programmer-switch" at macgpio0 offset 0x11 not configured "gpio5" at macgpio0 offset 0x6f not configured "gpio6" at macgpio0 offset 0x70 not configured "extint-gpio15" at macgpio0 offset 0x67 not configured "escc-legacy" at macobio0 offset 0x12000 not configured zs0 at macobio0 offset 0x13000: irq 22,23 zstty0 at zs0 channel 0 zstty1 at zs0 channel 1 aoa0 at macobio0 offset 0x1: irq 30,1,2 "timer" at macobio0 offset 0x15000 not configured adb0 at macobio0 offset 0x16000 apm0 at adb0: battery flags 0x0, 0% charged piic0 at adb0 iic1 at piic0 maxtmp0 at iic1 addr 0xc8: max6642 kiic1 at macobio0 offset 0x18000 iic2 at kiic1 wdc0 at macobio0 offset 0x2 irq 24: DMA audio0 at aoa0 ohci0 at pci1 dev 26 function 0 "Apple Intrepid USB" rev 0x00: irq 29, version 1.0, legacy support ohci1 at pci1 dev 27 function 0 "NEC USB" rev 0x43: irq 63, version 1.0 ohci2 at pci1 dev 27 function 1 "NEC USB" rev 0x43: irq 63, version 1.0 ehci0 at pci1 dev 27 function 2 "NEC USB" rev 0x04: irq 63 usb0 at ehci0: USB revision 2.0 uhub0 at usb0 configuration 1 interface 0 "NEC EHCI root hub" rev 2.00/1.00 addr 1 usb1 at ohci0: USB revision 1.0 uhub1 at usb1 configuration 1 interface 0 "Apple OHCI root hub" rev 1.00/1.00 addr 1 usb2 at ohci1: USB revision 1.0 uhub2 at usb2 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 addr 1 usb3 at ohci2: USB revision 1.0 uhub3 at usb3 configuration 1 interface 0 "NEC OHCI root hub" rev 1.00/1.00 addr 1 mpcpcibr2 at mainbus0 pci: uni-north pci2 at mpcpcibr2 bus 0 kauaiata0 at pci2 dev 13 function 0 "Apple Intrepid ATA" rev 0x00 wdc1 at kauaiata0 irq 39: DMA wd0 at wdc1 channel 0 drive 0: wd0: 16-sector PIO, LBA48, 152627MB, 312581808 sectors wd0(wdc1:0:0): using PIO mode 4, DMA mode 2, Ultra-DMA mode 5 "Apple UniNorth Firewire" rev 0x81 at pci2 dev 14 function 0 not configured gem0 at pci2 dev 15 function 0 "Apple Uni-N2 GMAC" rev 0x80: irq 
41, address 00:14:51:17:42:34 bmtphy0 at gem0 phy 0: BCM5221 100baseTX PHY, rev. 4 uhub4 at uhub3 port 1 configuration 1 interface 0 "Mitsumi Electric Hub in Apple Extended USB Keyboard" rev 1.10/4.10 addr 2 uhidev0 at uhub4 port 3 configuration 1 interface 0 "Mitsumi Electric Apple Extended USB Keyboard" rev 1.10/4.10 addr 3 uhidev0: iclass 3/1 ukbd0 at uhidev0: 8 variable keys, 6 key codes, country code 13 wskbd0 at ukbd0: console keyboard uhidev1 at uhub4 port 3 configuration 1 interface 1 "Mitsumi Electric Apple Extended USB Keyboard" rev 1.10/4.10 addr 3 uhidev1: iclass 3/0, 3 report ids uhid0 at uhidev1 reportid 2: input=1, output=0, feature=0 ucc0 at uhidev1 reportid 3: 4 usages, 4 keys, enum wskbd1 at ucc0 mux 1 vscsi0 at root scsibus1 at vscsi0: 256 targets softraid0 at root scsibus2 at softraid0: 256 targets bootpath: /pci@f400/ata-6@d/disk@0:/bsd root on wd0a (64c5f028a86139ec.a) swap on wd0b dump on wd0b
Re: request for testing: malloc and large allocations
On Tue, Feb 01, 2022 at 08:00:36AM +0100, Otto Moerbeek wrote: > > Are you running with any malloc flags? > > This bug report enabled me to find a bug that would pop up if G mode > is enabled. > > New diff below. New tests appreciated. It passed a full regress run on amd64 with vm.malloc_conf CFGJU bluhm > Index: stdlib/malloc.c > === > RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v > retrieving revision 1.272 > diff -u -p -r1.272 malloc.c > --- stdlib/malloc.c 19 Sep 2021 09:15:22 - 1.272 > +++ stdlib/malloc.c 31 Jan 2022 16:27:31 - > @@ -113,13 +113,27 @@ struct region_info { > > LIST_HEAD(chunk_head, chunk_info); > > -#define MAX_CACHEABLE_SIZE 32 > -struct cache { > - void *pages[MALLOC_MAXCACHE]; > +/* > + * Two caches, one for "small" regions, one for "big". > + * Small cache is an array per size, big cache is one array with different > + * sized regions > + */ > +#define MAX_SMALLCACHEABLE_SIZE 32 > +#define MAX_BIGCACHEABLE_SIZE512 > +/* If the total # of pages is larger than this, evict before inserting */ > +#define BIGCACHE_FILL(sz)(MAX_BIGCACHEABLE_SIZE * (sz) / 4) > + > +struct smallcache { > + void **pages; > ushort length; > ushort max; > }; > > +struct bigcache { > + void *page; > + size_t psize; > +}; > + > struct dir_info { > u_int32_t canary1; > int active; /* status of malloc */ > @@ -139,7 +153,10 @@ struct dir_info { > void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1]; > u_char rbytes[32]; /* random bytes */ > /* free pages cache */ > - struct cache cache[MAX_CACHEABLE_SIZE]; > + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE]; > + size_t bigcache_used; > + size_t bigcache_size; > + struct bigcache *bigcache; > #ifdef MALLOC_STATS > size_t inserts; > size_t insert_collisions; > @@ -207,7 +224,7 @@ struct malloc_readonly { > #ifdef MALLOC_STATS > int malloc_stats; /* dump statistics at end */ > #endif > - u_int32_t malloc_canary;/* Matched against ones in malloc_pool > */ > + u_int32_t malloc_canary;/* Matched against ones in pool */ > 
}; > > /* This object is mapped PROT_READ after initialisation to prevent tampering > */ > @@ -714,18 +731,61 @@ unmap(struct dir_info *d, void *p, size_ > size_t psz = sz >> MALLOC_PAGESHIFT; > void *r; > u_short i; > - struct cache *cache; > + struct smallcache *cache; > > if (sz != PAGEROUND(sz) || psz == 0) > wrterror(d, "munmap round"); > > - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) { > + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE && > + psz <= MAX_BIGCACHEABLE_SIZE) { > + u_short base = getrbyte(d); > + u_short j; > + > + /* don't look through all slots */ > + for (j = 0; j < d->bigcache_size / 4; j++) { > + i = (base + j) % d->bigcache_size; > + if (d->bigcache_used < > + BIGCACHE_FILL(d->bigcache_size)) { > + if (d->bigcache[i].psize == 0) > + break; > + } else { > + if (d->bigcache[i].psize != 0) > + break; > + } > + } > + /* if we didn't find a preferred slot, use random one */ > + if (d->bigcache[i].psize != 0) { > + size_t tmp; > + > + r = d->bigcache[i].page; > + d->bigcache_used -= d->bigcache[i].psize; > + tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT; > + if (!mopts.malloc_freeunmap) > + validate_junk(d, r, tmp); > + if (munmap(r, tmp)) > + wrterror(d, "munmap %p", r); > + STATS_SUB(d->malloc_used, tmp); > + } > + > + if (clear > 0) > + explicit_bzero(p, clear); > + if (mopts.malloc_freeunmap) { > + if (mprotect(p, sz, PROT_NONE)) > + wrterror(d, "mprotect %p", r); > + } else > + junk_free(d->malloc_junk, p, sz); > + d->bigcache[i].page = p; > + d->bigcache[i].psize = psz; > + d->bigcache_used += psz; > + return; > + } > + if (psz > MAX_SMALLCACHEABLE_SIZE || d->smallcache[psz - 1].max == 0) { > if (munmap(p, sz)) > wrterror(d, "munmap %p", p); > STATS_SUB(d->malloc_used, sz); > return; > } > - cache = >cache[psz - 1]; > + cache =
Re: request for testing: malloc and large allocations
On Fri, Jan 28, 2022 at 05:17:48PM +0100, Otto Moerbeek wrote: > On Fri, Jan 28, 2022 at 04:33:28PM +0100, Alexander Bluhm wrote: > > > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > > > currently malloc does cache a number of free'ed regions up to 128k in > > > size. This cache is indexed by size (in # of pages), so it is very > > > quick to check. > > > > > > Some programs allocate and deallocate larger allocations in a frantic > > > way. Accodomate those programs by also keeping a cache of regions > > > betwen 128k and 2M, in a cache of variable sized regions. > > > > > > My test case speeds up about twice. A make build gets a small speedup. > > > > > > This has been tested by myself on amd64 quite intensively. I am asking > > > for more tests, especialy on more "exotic" platforms. I wil do arm64 > > > myself soon. Test can be running your favorite programs, doing make > > > builds or running tests in regress/lib/libc/malloc. > > > > I see openssl and tmux crash with this diff. > > /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64, > > i386. > > Are you running with any malloc flags? This bug report enabled me to find a bug that would pop up if G mode is enabled. New diff below. New tests appreciated. -Otto Index: stdlib/malloc.c === RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v retrieving revision 1.272 diff -u -p -r1.272 malloc.c --- stdlib/malloc.c 19 Sep 2021 09:15:22 - 1.272 +++ stdlib/malloc.c 31 Jan 2022 16:27:31 - @@ -113,13 +113,27 @@ struct region_info { LIST_HEAD(chunk_head, chunk_info); -#define MAX_CACHEABLE_SIZE 32 -struct cache { - void *pages[MALLOC_MAXCACHE]; +/* + * Two caches, one for "small" regions, one for "big". 
+ * Small cache is an array per size, big cache is one array with different + * sized regions + */ +#define MAX_SMALLCACHEABLE_SIZE32 +#define MAX_BIGCACHEABLE_SIZE 512 +/* If the total # of pages is larger than this, evict before inserting */ +#define BIGCACHE_FILL(sz) (MAX_BIGCACHEABLE_SIZE * (sz) / 4) + +struct smallcache { + void **pages; ushort length; ushort max; }; +struct bigcache { + void *page; + size_t psize; +}; + struct dir_info { u_int32_t canary1; int active; /* status of malloc */ @@ -139,7 +153,10 @@ struct dir_info { void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1]; u_char rbytes[32]; /* random bytes */ /* free pages cache */ - struct cache cache[MAX_CACHEABLE_SIZE]; + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE]; + size_t bigcache_used; + size_t bigcache_size; + struct bigcache *bigcache; #ifdef MALLOC_STATS size_t inserts; size_t insert_collisions; @@ -207,7 +224,7 @@ struct malloc_readonly { #ifdef MALLOC_STATS int malloc_stats; /* dump statistics at end */ #endif - u_int32_t malloc_canary;/* Matched against ones in malloc_pool */ + u_int32_t malloc_canary;/* Matched against ones in pool */ }; /* This object is mapped PROT_READ after initialisation to prevent tampering */ @@ -714,18 +731,61 @@ unmap(struct dir_info *d, void *p, size_ size_t psz = sz >> MALLOC_PAGESHIFT; void *r; u_short i; - struct cache *cache; + struct smallcache *cache; if (sz != PAGEROUND(sz) || psz == 0) wrterror(d, "munmap round"); - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) { + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE && + psz <= MAX_BIGCACHEABLE_SIZE) { + u_short base = getrbyte(d); + u_short j; + + /* don't look through all slots */ + for (j = 0; j < d->bigcache_size / 4; j++) { + i = (base + j) % d->bigcache_size; + if (d->bigcache_used < + BIGCACHE_FILL(d->bigcache_size)) { + if (d->bigcache[i].psize == 0) + break; + } else { + if (d->bigcache[i].psize != 0) + break; + } + } + /* if we didn't find a preferred slot, use 
random one */ + if (d->bigcache[i].psize != 0) { + size_t tmp; + + r = d->bigcache[i].page; + d->bigcache_used -= d->bigcache[i].psize; + tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT; + if (!mopts.malloc_freeunmap) + validate_junk(d, r, tmp); + if (munmap(r, tmp)) +wrterror(d, "munmap %p",
Re: request for testing: malloc and large allocations
On Fri, Jan 28, 2022 at 04:33:28PM +0100, Alexander Bluhm wrote: > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > > currently malloc does cache a number of free'ed regions up to 128k in > > size. This cache is indexed by size (in # of pages), so it is very > > quick to check. > > > > Some programs allocate and deallocate larger allocations in a frantic > > way. Accodomate those programs by also keeping a cache of regions > > betwen 128k and 2M, in a cache of variable sized regions. > > > > My test case speeds up about twice. A make build gets a small speedup. > > > > This has been tested by myself on amd64 quite intensively. I am asking > > for more tests, especialy on more "exotic" platforms. I wil do arm64 > > myself soon. Test can be running your favorite programs, doing make > > builds or running tests in regress/lib/libc/malloc. > > I see openssl and tmux crash with this diff. > /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64, > i386. Are you running with any malloc flags? -Otto > > #0 L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52 > 52 L1: rep > (gdb) bt > #0 L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52 > #1 0x0fde4ed058d8 in _libc_explicit_bzero (buf=0xfded1f8e000, > len=18446744073709549600) at /usr/src/lib/libc/string/explicit_bzero.c:17 > #2 0x0fde4ed6f84f in unmap (d=0xfdead893830, p=Variable "p" is not > available. > ) at /usr/src/lib/libc/stdlib/malloc.c:805 > #3 0x0fde4ed6ceca in ofree (argpool=0x7f7c2268, p=Variable "p" is > not available. > ) at /usr/src/lib/libc/stdlib/malloc.c:1511 > #4 0x0fde4ed6e4cb in _libc_recallocarray (ptr=0xfded1f8e7e0, > oldnmemb=Variable "oldnmemb" is not available. 
> ) at /usr/src/lib/libc/stdlib/malloc.c:1908 > #5 0x0fdbd4787afe in xrecallocarray (ptr=Unhandled dwarf expression > opcode 0xa3 > ) at /usr/src/usr.bin/tmux/xmalloc.c:81 > #6 0x0fdbd4703ce6 in cmd_parse_build_commands (cmds=Unhandled dwarf > expression opcode 0xa3 > ) at cmd-parse.y:815 > #7 0x0fdbd4703d47 in cmd_parse_build_commands (cmds=Unhandled dwarf > expression opcode 0xa3 > ) at cmd-parse.y:823 > #8 0x0fdbd47040f7 in cmd_parse_from_buffer (buf=Unhandled dwarf > expression opcode 0xa3 > ) at cmd-parse.y:1036 > #9 0x0fdbd4703ffd in cmd_parse_from_string ( > s=0xfdbd46d6522 "bind > { display-menu -xP -yP -T > '#[align=centre]#{pane_index} (#{pane_id})' > '#{?#{m/r:(copy|view)-mode,#{pane_mode}},Go To Top,}' '<' {send -X > history-top} '#{?#{m/r:(copy|view)-mode,#{pane_mode}},G"..., > pi=0x7f7c24a0) at cmd-parse.y:959 > #10 0x0fdbd473506e in key_bindings_init () at > /usr/src/usr.bin/tmux/key-bindings.c:636 > #11 0x0fdbd47564c8 in server_start (client=0xfdec1056000, > flags=402653184, base=0xfdec103f400, lockfd=5, lockfile=0xfdec1041b80 > "/tmp/tmux-0/default.lock") > at /usr/src/usr.bin/tmux/server.c:210 > #12 0x0fdbd46f8788 in client_main (base=0xfdec103f400, argc=Unhandled > dwarf expression opcode 0xa3 > ) at /usr/src/usr.bin/tmux/client.c:165 > #13 0x0fdbd4761b62 in main (argc=0, argv=0x7f7c2880) at > /usr/src/usr.bin/tmux/tmux.c:529 > > bluhm
Re: request for testing: malloc and large allocations
On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > currently malloc does cache a number of free'ed regions up to 128k in > size. This cache is indexed by size (in # of pages), so it is very > quick to check. > > Some programs allocate and deallocate larger allocations in a frantic > way. Accodomate those programs by also keeping a cache of regions > betwen 128k and 2M, in a cache of variable sized regions. > > My test case speeds up about twice. A make build gets a small speedup. > > This has been tested by myself on amd64 quite intensively. I am asking > for more tests, especialy on more "exotic" platforms. I wil do arm64 > myself soon. Test can be running your favorite programs, doing make > builds or running tests in regress/lib/libc/malloc. I see openssl and tmux crash with this diff. /usr/src/regress/usr.sbin/openssl reproduces it on arm64, amd64, i386. #0 L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52 52 L1: rep (gdb) bt #0 L1 () at /usr/src/lib/libc/arch/amd64/string/memset.S:52 #1 0x0fde4ed058d8 in _libc_explicit_bzero (buf=0xfded1f8e000, len=18446744073709549600) at /usr/src/lib/libc/string/explicit_bzero.c:17 #2 0x0fde4ed6f84f in unmap (d=0xfdead893830, p=Variable "p" is not available. ) at /usr/src/lib/libc/stdlib/malloc.c:805 #3 0x0fde4ed6ceca in ofree (argpool=0x7f7c2268, p=Variable "p" is not available. ) at /usr/src/lib/libc/stdlib/malloc.c:1511 #4 0x0fde4ed6e4cb in _libc_recallocarray (ptr=0xfded1f8e7e0, oldnmemb=Variable "oldnmemb" is not available. 
) at /usr/src/lib/libc/stdlib/malloc.c:1908 #5 0x0fdbd4787afe in xrecallocarray (ptr=Unhandled dwarf expression opcode 0xa3 ) at /usr/src/usr.bin/tmux/xmalloc.c:81 #6 0x0fdbd4703ce6 in cmd_parse_build_commands (cmds=Unhandled dwarf expression opcode 0xa3 ) at cmd-parse.y:815 #7 0x0fdbd4703d47 in cmd_parse_build_commands (cmds=Unhandled dwarf expression opcode 0xa3 ) at cmd-parse.y:823 #8 0x0fdbd47040f7 in cmd_parse_from_buffer (buf=Unhandled dwarf expression opcode 0xa3 ) at cmd-parse.y:1036 #9 0x0fdbd4703ffd in cmd_parse_from_string ( s=0xfdbd46d6522 "bind > { display-menu -xP -yP -T '#[align=centre]#{pane_index} (#{pane_id})' '#{?#{m/r:(copy|view)-mode,#{pane_mode}},Go To Top,}' '<' {send -X history-top} '#{?#{m/r:(copy|view)-mode,#{pane_mode}},G"..., pi=0x7f7c24a0) at cmd-parse.y:959 #10 0x0fdbd473506e in key_bindings_init () at /usr/src/usr.bin/tmux/key-bindings.c:636 #11 0x0fdbd47564c8 in server_start (client=0xfdec1056000, flags=402653184, base=0xfdec103f400, lockfd=5, lockfile=0xfdec1041b80 "/tmp/tmux-0/default.lock") at /usr/src/usr.bin/tmux/server.c:210 #12 0x0fdbd46f8788 in client_main (base=0xfdec103f400, argc=Unhandled dwarf expression opcode 0xa3 ) at /usr/src/usr.bin/tmux/client.c:165 #13 0x0fdbd4761b62 in main (argc=0, argv=0x7f7c2880) at /usr/src/usr.bin/tmux/tmux.c:529 bluhm
Re: request for testing: malloc and large allocations
On Sat, Jan 22, 2022 at 09:25:25AM +0100, Otto Moerbeek wrote: > On Mon, Jan 17, 2022 at 08:42:47AM +0100, Otto Moerbeek wrote: > > > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > > > > > Hi, > > > > > > currently malloc does cache a number of free'ed regions up to 128k in > > > size. This cache is indexed by size (in # of pages), so it is very > > > quick to check. > > > > > > Some programs allocate and deallocate larger allocations in a frantic > > > way. Accommodate those programs by also keeping a cache of regions > > > between 128k and 2M, in a cache of variable sized regions. > > > > > > My test case speeds up about twice. A make build gets a small speedup. > > > > > > This has been tested by myself on amd64 quite intensively. I am asking > > > for more tests, especially on more "exotic" platforms. I will do arm64 > > > myself soon. Test can be running your favorite programs, doing make > > > builds or running tests in regress/lib/libc/malloc. > > > > > > Thanks in advance! > > > > > > I received several success reports and one failure on macppc report > > from Miod. I'm investigating that report to see if this diff can be > > blamed. > > > > In the meantime: keep on testing! > > > > -Otto > > So far I have been unable to reproduce. I would like confirmation > either way from somebody else having a macppc machine. Any volunteer? > > Thanks, > > -Otto It turns out the macppc troubles have been resolved by an uvm commit. -Otto
Re: request for testing: malloc and large allocations
On Mon, Jan 17, 2022 at 08:42:47AM +0100, Otto Moerbeek wrote: > On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > > > Hi, > > > > currently malloc does cache a number of free'ed regions up to 128k in > > size. This cache is indexed by size (in # of pages), so it is very > > quick to check. > > > > Some programs allocate and deallocate larger allocations in a frantic > > way. Accommodate those programs by also keeping a cache of regions > > between 128k and 2M, in a cache of variable sized regions. > > > > My test case speeds up about twice. A make build gets a small speedup. > > > > This has been tested by myself on amd64 quite intensively. I am asking > > for more tests, especially on more "exotic" platforms. I will do arm64 > > myself soon. Test can be running your favorite programs, doing make > > builds or running tests in regress/lib/libc/malloc. > > > > Thanks in advance! > > > I received several success reports and one failure on macppc report > from Miod. I'm investigating that report to see if this diff can be > blamed. > > In the meantime: keep on testing! > > -Otto So far I have been unable to reproduce. I would like confirmation either way from somebody else having a macppc machine. Any volunteer? Thanks, -Otto
Re: request for testing: malloc and large allocations
On Sun, Jan 09, 2022 at 02:54:43PM +0100, Otto Moerbeek wrote: > Hi, > > currently malloc does cache a number of free'ed regions up to 128k in > size. This cache is indexed by size (in # of pages), so it is very > quick to check. > > Some programs allocate and deallocate larger allocations in a frantic > way. Accommodate those programs by also keeping a cache of regions > between 128k and 2M, in a cache of variable sized regions. > > My test case speeds up about twice. A make build gets a small speedup. > > This has been tested by myself on amd64 quite intensively. I am asking > for more tests, especially on more "exotic" platforms. I will do arm64 > myself soon. Test can be running your favorite programs, doing make > builds or running tests in regress/lib/libc/malloc. > > Thanks in advance! I received several success reports and one failure on macppc report from Miod. I'm investigating that report to see if this diff can be blamed. In the meantime: keep on testing! -Otto
Re: request for testing: malloc and large allocations
Hey, On 1/9/22 14:54, Otto Moerbeek wrote: currently malloc does cache a number of free'ed regions up to 128k in size. This cache is indexed by size (in # of pages), so it is very quick to check. Some programs allocate and deallocate larger allocations in a frantic way. Accommodate those programs by also keeping a cache of regions between 128k and 2M, in a cache of variable sized regions. My test case speeds up about twice. A make build gets a small speedup. This has been tested by myself on amd64 quite intensively. I am asking for more tests, especially on more "exotic" platforms. I will do arm64 myself soon. Test can be running your favorite programs, doing make builds or running tests in regress/lib/libc/malloc. Thanks in advance! I did some workloads with your patch applied. I did not notice any speedups (but I also did not time it, just a suggestive feeling while waiting for cargo build to finish). To me everything works fine. I am on a normal amd64; sadly I don't have other hardware available. I hope this helps Greetings Leo
Re: request for testing: malloc and large allocations
Hi Otto, * Otto Moerbeek wrote: > Hi, > > currently malloc does cache a number of free'ed regions up to 128k in > size. This cache is indexed by size (in # of pages), so it is very > quick to check. > > Some programs allocate and deallocate larger allocations in a frantic > way. Accommodate those programs by also keeping a cache of regions > between 128k and 2M, in a cache of variable sized regions. > > My test case speeds up about twice. A make build gets a small speedup. > > This has been tested by myself on amd64 quite intensively. I am asking > for more tests, especially on more "exotic" platforms. I will do arm64 > myself soon. Test can be running your favorite programs, doing make > builds or running tests in regress/lib/libc/malloc. I have your patch running on an amd64 Thinkpad T450s with usual desktop usage and noticed no regression so far. Cheers Matthias
Re: request for testing: malloc and large allocations
On 2022/01/09 14:54, Otto Moerbeek wrote: > currently malloc does cache a number of free'ed regions up to 128k in > size. This cache is indexed by size (in # of pages), so it is very > quick to check. > > Some programs allocate and deallocate larger allocations in a frantic > way. Accodomate those programs by also keeping a cache of regions > betwen 128k and 2M, in a cache of variable sized regions. > > My test case speeds up about twice. A make build gets a small speedup. > > This has been tested by myself on amd64 quite intensively. I am asking > for more tests, especialy on more "exotic" platforms. I wil do arm64 > myself soon. Test can be running your favorite programs, doing make > builds or running tests in regress/lib/libc/malloc. This has been through mkr and ports bulk build on i386. Ports build times vary too much to say if it's faster there but no issues have been seen. > Thanks in advance! > > -Otto > > Index: stdlib/malloc.c > === > RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v > retrieving revision 1.272 > diff -u -p -r1.272 malloc.c > --- stdlib/malloc.c 19 Sep 2021 09:15:22 - 1.272 > +++ stdlib/malloc.c 9 Jan 2022 13:10:35 - > @@ -113,13 +113,28 @@ struct region_info { > > LIST_HEAD(chunk_head, chunk_info); > > -#define MAX_CACHEABLE_SIZE 32 > -struct cache { > - void *pages[MALLOC_MAXCACHE]; > +/* > + * Two caches, one for "small" regions, one for "big". 
> + * Small cacche is an array per size, big cache is one array with different > + * sizes regions > + */ > +#define MAX_SMALLCACHEABLE_SIZE 32 > +#define MAX_BIGCACHEABLE_SIZE512 > +#define BIGCACHE_SIZEMALLOC_MAXCACHE > +/* If the total # of pages is larger than this, evict before inserting */ > +#define BIGCACHE_FILL(sz)(MAX_BIGCACHEABLE_SIZE * (sz) / 4) > + > +struct smallcache { > + void **pages; > ushort length; > ushort max; > }; > > +struct bigcache { > + void *page; > + size_t psize; > +}; > + > struct dir_info { > u_int32_t canary1; > int active; /* status of malloc */ > @@ -139,7 +154,10 @@ struct dir_info { > void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1]; > u_char rbytes[32]; /* random bytes */ > /* free pages cache */ > - struct cache cache[MAX_CACHEABLE_SIZE]; > + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE]; > + ushort bigcache_size; > + size_t bigcache_used; > + struct bigcache bigcache[BIGCACHE_SIZE]; > #ifdef MALLOC_STATS > size_t inserts; > size_t insert_collisions; > @@ -714,18 +732,61 @@ unmap(struct dir_info *d, void *p, size_ > size_t psz = sz >> MALLOC_PAGESHIFT; > void *r; > u_short i; > - struct cache *cache; > + struct smallcache *cache; > > if (sz != PAGEROUND(sz) || psz == 0) > wrterror(d, "munmap round"); > > - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) { > + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE && > + psz <= MAX_BIGCACHEABLE_SIZE) { > + u_short base = getrbyte(d); > + u_short j; > + > + /* don't look through all slots */ > + for (j = 0; j < d->bigcache_size / 4; j++) { > + i = (base + j) % d->bigcache_size; > + if (d->bigcache_used < > + BIGCACHE_FILL(d->bigcache_size)) { > + if (d->bigcache[i].psize == 0) > + break; > + } else { > + if (d->bigcache[i].psize != 0) > + break; > + } > + } > + /* if we didn't find a preferred slot, use random one */ > + if (d->bigcache[i].psize != 0) { > + size_t tmp; > + > + r = d->bigcache[i].page; > + d->bigcache_used -= d->bigcache[i].psize; > + 
tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT; > + if (!mopts.malloc_freeunmap) > + validate_junk(d, r, tmp); > + if (munmap(r, tmp)) > + wrterror(d, "munmap %p", r); > + STATS_SUB(d->malloc_used, tmp); > + } > + > + if (clear > 0) > + explicit_bzero(p, clear); > + if (mopts.malloc_freeunmap) { > + if (mprotect(p, sz, PROT_NONE)) > + wrterror(d, "mprotect %p", r); > + } else > + junk_free(d->malloc_junk, p, sz); > + d->bigcache[i].page = p; > + d->bigcache[i].psize = psz; > + d->bigcache_used += psz; > + return; > + } > +
request for testing: malloc and large allocations
Hi, currently malloc does cache a number of free'ed regions up to 128k in size. This cache is indexed by size (in # of pages), so it is very quick to check. Some programs allocate and deallocate larger allocations in a frantic way. Accodomate those programs by also keeping a cache of regions betwen 128k and 2M, in a cache of variable sized regions. My test case speeds up about twice. A make build gets a small speedup. This has been tested by myself on amd64 quite intensively. I am asking for more tests, especialy on more "exotic" platforms. I wil do arm64 myself soon. Test can be running your favorite programs, doing make builds or running tests in regress/lib/libc/malloc. Thanks in advance! -Otto Index: stdlib/malloc.c === RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v retrieving revision 1.272 diff -u -p -r1.272 malloc.c --- stdlib/malloc.c 19 Sep 2021 09:15:22 - 1.272 +++ stdlib/malloc.c 9 Jan 2022 13:10:35 - @@ -113,13 +113,28 @@ struct region_info { LIST_HEAD(chunk_head, chunk_info); -#define MAX_CACHEABLE_SIZE 32 -struct cache { - void *pages[MALLOC_MAXCACHE]; +/* + * Two caches, one for "small" regions, one for "big". 
+ * Small cacche is an array per size, big cache is one array with different + * sizes regions + */ +#define MAX_SMALLCACHEABLE_SIZE32 +#define MAX_BIGCACHEABLE_SIZE 512 +#define BIGCACHE_SIZE MALLOC_MAXCACHE +/* If the total # of pages is larger than this, evict before inserting */ +#define BIGCACHE_FILL(sz) (MAX_BIGCACHEABLE_SIZE * (sz) / 4) + +struct smallcache { + void **pages; ushort length; ushort max; }; +struct bigcache { + void *page; + size_t psize; +}; + struct dir_info { u_int32_t canary1; int active; /* status of malloc */ @@ -139,7 +154,10 @@ struct dir_info { void *delayed_chunks[MALLOC_DELAYED_CHUNK_MASK + 1]; u_char rbytes[32]; /* random bytes */ /* free pages cache */ - struct cache cache[MAX_CACHEABLE_SIZE]; + struct smallcache smallcache[MAX_SMALLCACHEABLE_SIZE]; + ushort bigcache_size; + size_t bigcache_used; + struct bigcache bigcache[BIGCACHE_SIZE]; #ifdef MALLOC_STATS size_t inserts; size_t insert_collisions; @@ -714,18 +732,61 @@ unmap(struct dir_info *d, void *p, size_ size_t psz = sz >> MALLOC_PAGESHIFT; void *r; u_short i; - struct cache *cache; + struct smallcache *cache; if (sz != PAGEROUND(sz) || psz == 0) wrterror(d, "munmap round"); - if (psz > MAX_CACHEABLE_SIZE || d->cache[psz - 1].max == 0) { + if (d->bigcache_size > 0 && psz > MAX_SMALLCACHEABLE_SIZE && + psz <= MAX_BIGCACHEABLE_SIZE) { + u_short base = getrbyte(d); + u_short j; + + /* don't look through all slots */ + for (j = 0; j < d->bigcache_size / 4; j++) { + i = (base + j) % d->bigcache_size; + if (d->bigcache_used < + BIGCACHE_FILL(d->bigcache_size)) { + if (d->bigcache[i].psize == 0) + break; + } else { + if (d->bigcache[i].psize != 0) + break; + } + } + /* if we didn't find a preferred slot, use random one */ + if (d->bigcache[i].psize != 0) { + size_t tmp; + + r = d->bigcache[i].page; + d->bigcache_used -= d->bigcache[i].psize; + tmp = d->bigcache[i].psize << MALLOC_PAGESHIFT; + if (!mopts.malloc_freeunmap) + validate_junk(d, r, tmp); + if (munmap(r, tmp)) 
+wrterror(d, "munmap %p", r); + STATS_SUB(d->malloc_used, tmp); + } + + if (clear > 0) + explicit_bzero(p, clear); + if (mopts.malloc_freeunmap) { + if (mprotect(p, sz, PROT_NONE)) + wrterror(d, "mprotect %p", r); + } else + junk_free(d->malloc_junk, p, sz); + d->bigcache[i].page = p; + d->bigcache[i].psize = psz; + d->bigcache_used += psz; + return; + } + if (psz > MAX_SMALLCACHEABLE_SIZE || d->smallcache[psz - 1].max == 0) { if (munmap(p, sz)) wrterror(d, "munmap %p", p); STATS_SUB(d->malloc_used, sz); return; } - cache = >cache[psz - 1]; + cache =