Module Name: src Committed By: christos Date: Sat Aug 27 16:23:44 UTC 2011
Modified Files: src/sys/arch/amd64/amd64: machdep.c src/sys/arch/amd64/include: pmap.h Log Message: Implement sparse dumps for amd64 (copied from i386). Disabled for now via sysctl. XXX: most of the code can be merged. To generate a diff of this commit: cvs rdiff -u -r1.164 -r1.165 src/sys/arch/amd64/amd64/machdep.c cvs rdiff -u -r1.25 -r1.26 src/sys/arch/amd64/include/pmap.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/arch/amd64/amd64/machdep.c diff -u src/sys/arch/amd64/amd64/machdep.c:1.164 src/sys/arch/amd64/amd64/machdep.c:1.165 --- src/sys/arch/amd64/amd64/machdep.c:1.164 Thu Aug 11 14:11:17 2011 +++ src/sys/arch/amd64/amd64/machdep.c Sat Aug 27 12:23:44 2011 @@ -1,7 +1,7 @@ -/* $NetBSD: machdep.c,v 1.164 2011/08/11 18:11:17 cherry Exp $ */ +/* $NetBSD: machdep.c,v 1.165 2011/08/27 16:23:44 christos Exp $ */ /*- - * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008 + * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 * The NetBSD Foundation, Inc. * All rights reserved. * @@ -9,6 +9,10 @@ * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center. * + * This code is derived from software contributed to The NetBSD Foundation + * by Coyote Point Systems, Inc. which was written under contract to Coyote + * Point by Jed Davis and Devon O'Dell. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -107,7 +111,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.164 2011/08/11 18:11:17 cherry Exp $"); +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.165 2011/08/27 16:23:44 christos Exp $"); /* #define XENDEBUG_LOW */ @@ -239,6 +243,25 @@ uint64_t dumpmem_high; int cpu_class; + +#ifndef NO_SPARSE_DUMP +int sparse_dump = 0; + +paddr_t max_paddr = 0; +unsigned char *sparse_dump_physmap; +#endif + +char *dump_headerbuf, *dump_headerbuf_ptr; +#define dump_headerbuf_size PAGE_SIZE +#define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size) +#define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr) +daddr_t dump_header_blkno; + +size_t dump_nmemsegs; +size_t dump_npages; +size_t dump_header_size; +size_t dump_totalbytesleft; + vaddr_t msgbuf_vaddr; paddr_t msgbuf_paddr; @@ -290,8 +313,28 @@ int cpu_dump(void); int cpu_dumpsize(void); u_long 
cpu_dump_mempagecnt(void); -void dumpsys(void); void dodumpsys(void); +void dumpsys(void); + +void dump_misc_init(void); +void dump_seg_prep(void); +int dump_seg_iter(int (*)(paddr_t, paddr_t)); + +#ifndef NO_SPARSE_DUMP +void sparse_dump_reset(void); +void sparse_dump_mark(vaddr_t, vaddr_t, int); +void cpu_dump_prep_sparse(void); +#endif + +void dump_header_start(void); +int dump_header_flush(void); +int dump_header_addbytes(const void*, size_t); +int dump_header_addseg(paddr_t, paddr_t); +int dump_header_finish(void); + +int dump_seg_count_range(paddr_t, paddr_t); +int dumpsys_seg(paddr_t, paddr_t); + void init_x86_64(paddr_t); /* @@ -530,6 +573,14 @@ SYSCTL_DESCR("Whether the kernel uses PAE"), NULL, 1, NULL, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); +#ifndef NO_SPARSE_DUMP + /* XXXjld Does this really belong under machdep, and not e.g. kern? */ + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "sparse_dump", NULL, + NULL, 0, &sparse_dump, 0, + CTL_MACHDEP, CTL_CREATE, CTL_EOL); +#endif } void @@ -746,6 +797,259 @@ * XXXfvdl share dumpcode. */ + /* + * Perform assorted dump-related initialization tasks. Assumes that + * the maximum physical memory address will not increase afterwards. 
+ */ +void +dump_misc_init(void) +{ +#ifndef NO_SPARSE_DUMP + int i; +#endif + + if (dump_headerbuf != NULL) + return; /* already called */ + +#ifndef NO_SPARSE_DUMP + for (i = 0; i < mem_cluster_cnt; ++i) { + paddr_t top = mem_clusters[i].start + mem_clusters[i].size; + if (max_paddr < top) + max_paddr = top; + } +#ifdef DEBUG + printf("dump_misc_init: max_paddr = 0x%lx\n", + (unsigned long)max_paddr); +#endif + + sparse_dump_physmap = (void*)uvm_km_alloc(kernel_map, + roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE), + PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); +#endif + dump_headerbuf = (void*)uvm_km_alloc(kernel_map, + dump_headerbuf_size, + PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); + /* XXXjld should check for failure here, disable dumps if so. */ +} + +#ifndef NO_SPARSE_DUMP +/* + * Clear the set of pages to include in a sparse dump. + */ +void +sparse_dump_reset(void) +{ + memset(sparse_dump_physmap, 0, + roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE)); +} + +/* + * Include or exclude pages in a sparse dump, by half-open virtual + * address interval (which may wrap around the end of the space). + */ +void +sparse_dump_mark(vaddr_t vbegin, vaddr_t vend, int includep) +{ + pmap_t pmap; + paddr_t p; + vaddr_t v; + + /* + * If a partial page is called for, the whole page must be included. + */ + if (includep) { + vbegin = rounddown(vbegin, PAGE_SIZE); + vend = roundup(vend, PAGE_SIZE); + } else { + vbegin = roundup(vbegin, PAGE_SIZE); + vend = rounddown(vend, PAGE_SIZE); + } + + pmap = pmap_kernel(); + for (v = vbegin; v != vend; v += PAGE_SIZE) { + if (pmap_extract(pmap, v, &p)) { + if (includep) + setbit(sparse_dump_physmap, p/PAGE_SIZE); + else + clrbit(sparse_dump_physmap, p/PAGE_SIZE); + } + } +} + +/* + * Machine-dependently decides on the contents of a sparse dump, using + * the above. + */ +void +cpu_dump_prep_sparse(void) +{ + sparse_dump_reset(); + /* XXX could the alternate recursive page table be skipped? 
*/ + sparse_dump_mark((vaddr_t)PTE_BASE, (vaddr_t)KERN_BASE, 1); + /* Memory for I/O buffers could be unmarked here, for example. */ + /* The kernel text could also be unmarked, but gdb would be upset. */ +} +#endif + +/* + * Abstractly iterate over the collection of memory segments to be + * dumped; the callback lacks the customary environment-pointer + * argument because none of the current users really need one. + * + * To be used only after dump_seg_prep is called to set things up. + */ +int +dump_seg_iter(int (*callback)(paddr_t, paddr_t)) +{ + int error, i; + +#define CALLBACK(start,size) do { \ + error = callback(start,size); \ + if (error) \ + return error; \ +} while(0) + + for (i = 0; i < mem_cluster_cnt; ++i) { +#ifndef NO_SPARSE_DUMP + /* + * The bitmap is scanned within each memory segment, + * rather than over its entire domain, in case any + * pages outside of the memory proper have been mapped + * into kva; they might be devices that wouldn't + * appreciate being arbitrarily read, and including + * them could also break the assumption that a sparse + * dump will always be smaller than a full one. + */ + if (sparse_dump) { + paddr_t p, start, end; + int lastset; + + start = mem_clusters[i].start; + end = start + mem_clusters[i].size; + start = rounddown(start, PAGE_SIZE); /* unnecessary? */ + lastset = 0; + for (p = start; p < end; p += PAGE_SIZE) { + int thisset = isset(sparse_dump_physmap, + p/PAGE_SIZE); + + if (!lastset && thisset) + start = p; + if (lastset && !thisset) + CALLBACK(start, p - start); + lastset = thisset; + } + if (lastset) + CALLBACK(start, p - start); + } else +#endif + CALLBACK(mem_clusters[i].start, mem_clusters[i].size); + } + return 0; +#undef CALLBACK +} + +/* + * Prepare for an impending core dump: decide what's being dumped and + * how much space it will take up. 
+ */ +void +dump_seg_prep(void) +{ +#ifndef NO_SPARSE_DUMP + if (sparse_dump) + cpu_dump_prep_sparse(); +#endif + + dump_nmemsegs = 0; + dump_npages = 0; + dump_seg_iter(dump_seg_count_range); + + dump_header_size = ALIGN(sizeof(kcore_seg_t)) + + ALIGN(sizeof(cpu_kcore_hdr_t)) + + ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t)); + dump_header_size = roundup(dump_header_size, dbtob(1)); + + /* + * savecore(8) will read this to decide how many pages to + * copy, and cpu_dumpconf has already used the pessimistic + * value to set dumplo, so it's time to tell the truth. + */ + dumpsize = dump_npages; /* XXX could these just be one variable? */ +} + +int +dump_seg_count_range(paddr_t start, paddr_t size) +{ + ++dump_nmemsegs; + dump_npages += size / PAGE_SIZE; + return 0; +} + +/* + * A sparse dump's header may be rather large, due to the number of + * "segments" emitted. These routines manage a simple output buffer, + * so that the header can be written to disk incrementally. + */ +void +dump_header_start(void) +{ + dump_headerbuf_ptr = dump_headerbuf; + dump_header_blkno = dumplo; +} + +int +dump_header_flush(void) +{ + const struct bdevsw *bdev; + size_t to_write; + int error; + + bdev = bdevsw_lookup(dumpdev); + to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); + error = bdev->d_dump(dumpdev, dump_header_blkno, + dump_headerbuf, to_write); + dump_header_blkno += btodb(to_write); + dump_headerbuf_ptr = dump_headerbuf; + return error; +} + +int +dump_header_addbytes(const void* vptr, size_t n) +{ + const char* ptr = vptr; + int error; + + while (n > dump_headerbuf_avail) { + memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); + ptr += dump_headerbuf_avail; + n -= dump_headerbuf_avail; + dump_headerbuf_ptr = dump_headerbuf_end; + error = dump_header_flush(); + if (error) + return error; + } + memcpy(dump_headerbuf_ptr, ptr, n); + dump_headerbuf_ptr += n; + + return 0; +} + +int +dump_header_addseg(paddr_t start, paddr_t size) +{ + phys_ram_seg_t seg = 
{ start, size }; + + return dump_header_addbytes(&seg, sizeof(seg)); +} + +int +dump_header_finish(void) +{ + memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); + return dump_header_flush(); +} + + /* * These variables are needed by /sbin/savecore */ @@ -754,7 +1058,8 @@ long dumplo = 0; /* blocks */ /* - * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers. + * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers + * for a full (non-sparse) dump. */ int cpu_dumpsize(void) @@ -770,7 +1075,8 @@ } /* - * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped. + * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped + * for a full (non-sparse) dump. */ u_long cpu_dump_mempagecnt(void) @@ -790,12 +1096,9 @@ cpu_dump(void) { int (*dump)(dev_t, daddr_t, void *, size_t); - char buf[dbtob(1)]; - kcore_seg_t *segp; - cpu_kcore_hdr_t *cpuhdrp; - phys_ram_seg_t *memsegp; + kcore_seg_t seg; + cpu_kcore_hdr_t cpuhdr; const struct bdevsw *bdev; - int i; bdev = bdevsw_lookup(dumpdev); if (bdev == NULL) @@ -803,79 +1106,24 @@ dump = bdev->d_dump; - memset(buf, 0, sizeof buf); - segp = (kcore_seg_t *)buf; - cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))]; - memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) + - ALIGN(sizeof(*cpuhdrp))]; - /* * Generate a segment header. */ - CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU); - segp->c_size = dbtob(1) - ALIGN(sizeof(*segp)); + CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); + seg.c_size = dump_header_size - ALIGN(sizeof(seg)); + (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg))); /* * Add the machine-dependent header info. */ - cpuhdrp->ptdpaddr = PDPpaddr; - cpuhdrp->nmemsegs = mem_cluster_cnt; + cpuhdr.ptdpaddr = PDPpaddr; + cpuhdr.nmemsegs = dump_nmemsegs; + (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); /* - * Fill in the memory segment descriptors. + * Write out the memory segment descriptors. 
*/ - for (i = 0; i < mem_cluster_cnt; i++) { - memsegp[i].start = mem_clusters[i].start; - memsegp[i].size = mem_clusters[i].size; - } - - return (dump(dumpdev, dumplo, (void *)buf, dbtob(1))); -} - -/* - * This is called by main to set dumplo and dumpsize. - * Dumps always skip the first PAGE_SIZE of disk space - * in case there might be a disk label stored there. - * If there is extra space, put dump at the end to - * reduce the chance that swapping trashes it. - */ -void -cpu_dumpconf(void) -{ - const struct bdevsw *bdev; - int nblks, dumpblks; /* size of dump area */ - - if (dumpdev == NODEV) - goto bad; - bdev = bdevsw_lookup(dumpdev); - if (bdev == NULL) { - dumpdev = NODEV; - goto bad; - } - if (bdev->d_psize == NULL) - goto bad; - nblks = (*bdev->d_psize)(dumpdev); - if (nblks <= ctod(1)) - goto bad; - - dumpblks = cpu_dumpsize(); - if (dumpblks < 0) - goto bad; - dumpblks += ctod(cpu_dump_mempagecnt()); - - /* If dump won't fit (incl. room for possible label), punt. */ - if (dumpblks > (nblks - ctod(1))) - goto bad; - - /* Put dump at end of partition */ - dumplo = nblks - dumpblks; - - /* dumpsize is in page units, and doesn't include headers. */ - dumpsize = cpu_dump_mempagecnt(); - return; - - bad: - dumpsize = 0; + return dump_seg_iter(dump_header_addseg); } /* @@ -894,23 +1142,70 @@ return (p + BYTES_PER_DUMP); } +int +dumpsys_seg(paddr_t maddr, paddr_t bytes) +{ + u_long i, m, n; + daddr_t blkno; + const struct bdevsw *bdev; + int (*dump)(dev_t, daddr_t, void *, size_t); + int error; + + if (dumpdev == NODEV) + return ENODEV; + bdev = bdevsw_lookup(dumpdev); + if (bdev == NULL || bdev->d_psize == NULL) + return ENODEV; + + dump = bdev->d_dump; + + blkno = dump_header_blkno; + for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { + /* Print out how many MBs we have left to go. */ + if ((dump_totalbytesleft % (1024*1024)) == 0) + printf("%lu ", (unsigned long) + (dump_totalbytesleft / (1024 * 1024))); + + /* Limit size for next transfer. 
*/ + n = bytes - i; + if (n > BYTES_PER_DUMP) + n = BYTES_PER_DUMP; + + for (m = 0; m < n; m += NBPG) + pmap_kenter_pa(dumpspace + m, maddr + m, + VM_PROT_READ, 0); + pmap_update(pmap_kernel()); + + error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); + if (error) + return error; + maddr += n; + blkno += btodb(n); /* XXX? */ + +#if 0 /* XXX this doesn't work. grr. */ + /* operator aborting dump? */ + if (sget() != NULL) + return EINTR; +#endif + } + dump_header_blkno = blkno; + + return 0; +} + void dodumpsys(void) { const struct bdevsw *bdev; - u_long totalbytesleft, bytes, i, n, memseg; - u_long maddr; - int psize; - daddr_t blkno; - int (*dump)(dev_t, daddr_t, void *, size_t); + int dumpend, psize; int error; if (dumpdev == NODEV) return; + bdev = bdevsw_lookup(dumpdev); if (bdev == NULL || bdev->d_psize == NULL) return; - /* * For dumps during autoconfiguration, * if dump device has already configured... @@ -922,8 +1217,9 @@ minor(dumpdev)); return; } - printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev), - minor(dumpdev), dumplo); + printf("\ndumping to dev %llu,%llu offset %ld\n", + (unsigned long long)major(dumpdev), + (unsigned long long)minor(dumpdev), dumplo); psize = (*bdev->d_psize)(dumpdev); printf("dump "); @@ -932,48 +1228,43 @@ return; } +#if 0 /* XXX this doesn't work. grr. 
*/ + /* toss any characters present prior to dump */ + while (sget() != NULL); /*syscons and pccons differ */ +#endif + + dump_seg_prep(); + dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages); + if (dumpend > psize) { + printf("failed: insufficient space (%d < %d)\n", + psize, dumpend); + goto failed; + } + + dump_header_start(); if ((error = cpu_dump()) != 0) goto err; + if ((error = dump_header_finish()) != 0) + goto err; - totalbytesleft = ctob(cpu_dump_mempagecnt()); - blkno = dumplo + cpu_dumpsize(); - dump = bdev->d_dump; - error = 0; + if (dump_header_blkno != dumplo + btodb(dump_header_size)) { + printf("BAD header size (%ld [written] != %ld [expected])\n", + (long)(dump_header_blkno - dumplo), + (long)btodb(dump_header_size)); + goto failed; + } - for (memseg = 0; memseg < mem_cluster_cnt; memseg++) { - maddr = mem_clusters[memseg].start; - bytes = mem_clusters[memseg].size; - - for (i = 0; i < bytes; i += n, totalbytesleft -= n) { - /* Print out how many MBs we have left to go. */ - if ((totalbytesleft % (1024*1024)) == 0) - printf("%ld ", totalbytesleft / (1024 * 1024)); - - /* Limit size for next transfer. */ - n = bytes - i; - if (n > BYTES_PER_DUMP) - n = BYTES_PER_DUMP; - - (void) pmap_map(dumpspace, maddr, maddr + n, - VM_PROT_READ); - - error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); - if (error) - goto err; - maddr += n; - blkno += btodb(n); /* XXX? */ + dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP); + error = dump_seg_iter(dumpsys_seg); -#if 0 /* XXX this doesn't work. grr. */ - /* operator aborting dump? 
*/ - if (sget() != NULL) { - error = EINTR; - break; - } -#endif - } + if (error == 0 && dump_header_blkno != dumpend) { + printf("BAD dump size (%ld [written] != %ld [expected])\n", + (long)(dumpend - dumplo), + (long)(dump_header_blkno - dumplo)); + goto failed; } - err: +err: switch (error) { case ENXIO: @@ -1004,11 +1295,76 @@ printf("error %d\n", error); break; } +failed: printf("\n\n"); delay(5000000); /* 5 seconds */ } /* + * This is called by main to set dumplo and dumpsize. + * Dumps always skip the first PAGE_SIZE of disk space + * in case there might be a disk label stored there. + * If there is extra space, put dump at the end to + * reduce the chance that swapping trashes it. + * + * Sparse dumps can't be placed as close to the end as possible, because + * savecore(8) has to know where to start reading in the dump device + * before it has access to any of the crashed system's state. + * + * Note also that a sparse dump will never be larger than a full one: + * in order to add a phys_ram_seg_t to the header, at least one page + * must be removed. + */ +void +cpu_dumpconf(void) +{ + const struct bdevsw *bdev; + int nblks, dumpblks; /* size of dump area */ + + if (dumpdev == NODEV) + goto bad; + bdev = bdevsw_lookup(dumpdev); + if (bdev == NULL) { + dumpdev = NODEV; + goto bad; + } + if (bdev->d_psize == NULL) + goto bad; + nblks = (*bdev->d_psize)(dumpdev); + if (nblks <= ctod(1)) + goto bad; + + dumpblks = cpu_dumpsize(); + if (dumpblks < 0) + goto bad; + dumpblks += ctod(cpu_dump_mempagecnt()); + + /* If dump won't fit (incl. room for possible label), punt. */ + if (dumpblks > (nblks - ctod(1))) { +#ifndef NO_SPARSE_DUMP + /* A sparse dump might (and hopefully will) fit. */ + dumplo = ctod(1); +#else + /* But if we're not configured for that, punt. */ + goto bad; +#endif + } else { + /* Put dump at end of partition */ + dumplo = nblks - dumpblks; + } + + /* dumpsize is in page units, and doesn't include headers. 
*/ + dumpsize = cpu_dump_mempagecnt(); + + /* Now that we've decided this will work, init ancillary stuff. */ + dump_misc_init(); + return; + + bad: + dumpsize = 0; +} + +/* * Clear registers on exec */ void Index: src/sys/arch/amd64/include/pmap.h diff -u src/sys/arch/amd64/include/pmap.h:1.25 src/sys/arch/amd64/include/pmap.h:1.26 --- src/sys/arch/amd64/include/pmap.h:1.25 Sat Aug 13 08:09:38 2011 +++ src/sys/arch/amd64/include/pmap.h Sat Aug 27 12:23:44 2011 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.25 2011/08/13 12:09:38 cherry Exp $ */ +/* $NetBSD: pmap.h,v 1.26 2011/08/27 16:23:44 christos Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -177,6 +177,7 @@ */ #define PTE_BASE ((pt_entry_t *) (L4_SLOT_PTE * NBPD_L4)) +#define KERN_BASE ((pt_entry_t *) (L4_SLOT_KERN * NBPD_L4)) #define APTE_BASE ((pt_entry_t *) (VA_SIGN_NEG((L4_SLOT_APTE * NBPD_L4)))) #define L1_BASE PTE_BASE