Module Name:    src
Committed By:   ad
Date:           Sun Apr 19 14:11:38 UTC 2009

Modified Files:
        src/sys/arch/amd64/include: types.h
        src/sys/arch/i386/include: types.h
        src/sys/arch/x86/include: cpu.h intr.h pic.h
        src/sys/arch/x86/isa: isa_machdep.c
        src/sys/arch/x86/x86: idt.c intr.c ioapic.c
        src/sys/kern: kern_cpu.c
        src/sys/sys: cpu.h cpuio.h
        src/usr.sbin/cpuctl: cpuctl.c
Removed Files:
        src/sys/compat/sys: cpuio.h

Log Message:
cpuctl:

- Add interrupt shielding (direct hardware interrupts away from the
  specified CPUs).  Not documented just yet, but it will be soon; see the
  usage sketch below.

- Redo /dev/cpu time_t compat so that no kernel changes are needed; see
  the reassembly sketch below.
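
  Usage sketch for the shielding commands (syntax taken from the
  cpuctl(8) usage strings in the diff below; the CPU number is only an
  example):

    cpuctl nointr 1     direct hardware interrupts away from CPU 1
    cpuctl intr 1       let CPU 1 field hardware interrupts again

  For the time_t compat, cpustate_t carries the 64-bit timestamp as two
  int32_t halves (cs_lastmod, cs_lastmodhi) and userland reassembles it,
  as cpuctl.c does in the diff below:

    time_t lastmod;

    lastmod = (time_t)cs.cs_lastmod | ((time_t)cs.cs_lastmodhi << 32);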

x86:

- Make intr_establish, intr_disestablish safe to use when !cold.

- Distribute hardware interrupts among the CPUs, instead of directing
  everything to the boot CPU; see the selection sketch below.

- Add MD code for interrupt shielding.  This works in most cases, but
  there is a bug where delivery is not accepted by the LAPIC after
  redistribution.  It also needs re-balancing to make things fair after
  interrupts are turned back on for a CPU.
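
  A condensed sketch of the CPU selection used for distribution and
  shielding (lifted from intr_allocate_slot() in the diff below; the
  error handling and "allocate anywhere" fallback pass are omitted):

    struct cpu_info *ci, *lci;
    CPU_INFO_ITERATOR cii;

    /* Pick the least loaded CPU that still fields interrupts. */
    ci = NULL;
    for (CPU_INFO_FOREACH(cii, lci)) {
        if ((lci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
            continue;   /* shielded: not a candidate */
        if (ci == NULL || ci->ci_nintrhand > lci->ci_nintrhand)
            ci = lci;   /* fewer handlers == less loaded */
    }

  Moving an established interrupt (intr_redistribute() below) is a
  three-step cross-call dance: install the source on the new CPU, have
  the old CPU mask the pin with pic_trymask() (backing off while an
  interrupt is pending) and re-route it, then pause briefly and tear
  down the old slot.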


To generate a diff of this commit:
cvs rdiff -u -r1.31 -r1.32 src/sys/arch/amd64/include/types.h
cvs rdiff -u -r1.64 -r1.65 src/sys/arch/i386/include/types.h
cvs rdiff -u -r1.15 -r1.16 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.38 -r1.39 src/sys/arch/x86/include/intr.h
cvs rdiff -u -r1.6 -r1.7 src/sys/arch/x86/include/pic.h
cvs rdiff -u -r1.25 -r1.26 src/sys/arch/x86/isa/isa_machdep.c
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/x86/x86/idt.c
cvs rdiff -u -r1.60 -r1.61 src/sys/arch/x86/x86/intr.c
cvs rdiff -u -r1.39 -r1.40 src/sys/arch/x86/x86/ioapic.c
cvs rdiff -u -r1.2 -r0 src/sys/compat/sys/cpuio.h
cvs rdiff -u -r1.41 -r1.42 src/sys/kern/kern_cpu.c
cvs rdiff -u -r1.29 -r1.30 src/sys/sys/cpu.h
cvs rdiff -u -r1.3 -r1.4 src/sys/sys/cpuio.h
cvs rdiff -u -r1.13 -r1.14 src/usr.sbin/cpuctl/cpuctl.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/amd64/include/types.h
diff -u src/sys/arch/amd64/include/types.h:1.31 src/sys/arch/amd64/include/types.h:1.32
--- src/sys/arch/amd64/include/types.h:1.31	Sun Apr  5 00:57:56 2009
+++ src/sys/arch/amd64/include/types.h	Sun Apr 19 14:11:36 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: types.h,v 1.31 2009/04/05 00:57:56 tsutsui Exp $	*/
+/*	$NetBSD: types.h,v 1.32 2009/04/19 14:11:36 ad Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -76,6 +76,7 @@
 #define	__HAVE_ATOMIC64_OPS
 #define	__HAVE_ATOMIC_AS_MEMBAR
 #define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE_INTR_CONTROL
 
 #ifdef _KERNEL_OPT
 #include "opt_xen.h"

Index: src/sys/arch/i386/include/types.h
diff -u src/sys/arch/i386/include/types.h:1.64 src/sys/arch/i386/include/types.h:1.65
--- src/sys/arch/i386/include/types.h:1.64	Sun Apr  5 00:57:56 2009
+++ src/sys/arch/i386/include/types.h	Sun Apr 19 14:11:37 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: types.h,v 1.64 2009/04/05 00:57:56 tsutsui Exp $	*/
+/*	$NetBSD: types.h,v 1.65 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -84,6 +84,7 @@
 #define	__HAVE_ATOMIC64_OPS
 #define	__HAVE_ATOMIC_AS_MEMBAR
 #define	__HAVE_CPU_LWP_SETPRIVATE
+#define	__HAVE_INTR_CONTROL
 
 #if defined(_KERNEL)
 #define	__HAVE_RAS

Index: src/sys/arch/x86/include/cpu.h
diff -u src/sys/arch/x86/include/cpu.h:1.15 src/sys/arch/x86/include/cpu.h:1.16
--- src/sys/arch/x86/include/cpu.h:1.15	Thu Apr 16 15:34:23 2009
+++ src/sys/arch/x86/include/cpu.h	Sun Apr 19 14:11:37 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.15 2009/04/16 15:34:23 rmind Exp $	*/
+/*	$NetBSD: cpu.h,v 1.16 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -111,6 +111,7 @@
 #define	TLBSTATE_LAZY	1	/* tlbs are valid but won't be kept uptodate */
 #define	TLBSTATE_STALE	2	/* we might have stale user tlbs */
 	int ci_curldt;		/* current LDT descriptor */
+	int ci_nintrhand;	/* number of H/W interrupt handlers */
 	uint64_t ci_scratch;
 
 #ifdef XEN

Index: src/sys/arch/x86/include/intr.h
diff -u src/sys/arch/x86/include/intr.h:1.38 src/sys/arch/x86/include/intr.h:1.39
--- src/sys/arch/x86/include/intr.h:1.38	Fri Mar 27 16:09:24 2009
+++ src/sys/arch/x86/include/intr.h	Sun Apr 19 14:11:37 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: intr.h,v 1.38 2009/03/27 16:09:24 dyoung Exp $	*/
+/*	$NetBSD: intr.h,v 1.39 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 1998, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -103,6 +103,7 @@
 	int	(*ih_realfun)(void *);
 	void	*ih_realarg;
 	struct	intrhand *ih_next;
+	struct	intrhand **ih_prevp;
 	int	ih_pin;
 	int	ih_slot;
 	struct cpu_info *ih_cpu;
@@ -176,9 +177,7 @@
 void cpu_intr_init(struct cpu_info *);
 int intr_find_mpmapping(int, int, int *);
 struct pic *intr_findpic(int);
-#ifdef INTRDEBUG
 void intr_printconfig(void);
-#endif
 
 int x86_send_ipi(struct cpu_info *, int);
 void x86_broadcast_ipi(int);

Index: src/sys/arch/x86/include/pic.h
diff -u src/sys/arch/x86/include/pic.h:1.6 src/sys/arch/x86/include/pic.h:1.7
--- src/sys/arch/x86/include/pic.h:1.6	Thu Apr  2 00:09:32 2009
+++ src/sys/arch/x86/include/pic.h	Sun Apr 19 14:11:37 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: pic.h,v 1.6 2009/04/02 00:09:32 dyoung Exp $	*/
+/*	$NetBSD: pic.h,v 1.7 2009/04/19 14:11:37 ad Exp $	*/
 
 #ifndef _X86_PIC_H
 #define _X86_PIC_H
@@ -18,6 +18,7 @@
 	void (*pic_hwunmask)(struct pic *, int);
 	void (*pic_addroute)(struct pic *, struct cpu_info *, int, int, int);
 	void (*pic_delroute)(struct pic *, struct cpu_info *, int, int, int);
+	bool (*pic_trymask)(struct pic *, int);
 	struct intrstub *pic_level_stubs;
 	struct intrstub *pic_edge_stubs;
 	struct ioapic_softc *pic_ioapic; /* if pic_type == PIC_IOAPIC */

Index: src/sys/arch/x86/isa/isa_machdep.c
diff -u src/sys/arch/x86/isa/isa_machdep.c:1.25 src/sys/arch/x86/isa/isa_machdep.c:1.26
--- src/sys/arch/x86/isa/isa_machdep.c:1.25	Sat Mar 14 14:46:08 2009
+++ src/sys/arch/x86/isa/isa_machdep.c	Sun Apr 19 14:11:37 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: isa_machdep.c,v 1.25 2009/03/14 14:46:08 dsl Exp $	*/
+/*	$NetBSD: isa_machdep.c,v 1.26 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: isa_machdep.c,v 1.25 2009/03/14 14:46:08 dsl Exp $");
+__KERNEL_RCSID(0, "$NetBSD: isa_machdep.c,v 1.26 2009/04/19 14:11:37 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -74,10 +74,10 @@
 #include <sys/device.h>
 #include <sys/proc.h>
 #include <sys/mbuf.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
 
-#include <machine/bus.h>
 #include <machine/bus_private.h>
-
 #include <machine/pio.h>
 #include <machine/cpufunc.h>
 
@@ -127,7 +127,6 @@
 int
 isa_intr_alloc(isa_chipset_tag_t ic, int mask, int type, int *irq)
 {
-	extern kmutex_t x86_intr_lock;
 	int i, tmp, bestirq, count;
 	struct intrhand **p, *q;
 	struct intrsource *isp;
@@ -150,7 +149,7 @@
 	 */
 	mask &= 0xefbf;
 
-	mutex_enter(&x86_intr_lock);
+	mutex_enter(&cpu_lock);
 
 	for (i = 0; i < NUM_LEGACY_IRQS; i++) {
 		if (LEGAL_IRQ(i) == 0 || (mask & (1<<i)) == 0)
@@ -161,7 +160,7 @@
 			 * if nothing's using the irq, just return it
 			 */
 			*irq = i;
-			mutex_exit(&x86_intr_lock);
+			mutex_exit(&cpu_lock);
 			return (0);
 		}
 
@@ -194,7 +193,7 @@
 		}
 	}
 
-	mutex_exit(&x86_intr_lock);
+	mutex_exit(&cpu_lock);
 
 	if (bestirq == -1)
 		return (1);

Index: src/sys/arch/x86/x86/idt.c
diff -u src/sys/arch/x86/x86/idt.c:1.2 src/sys/arch/x86/x86/idt.c:1.3
--- src/sys/arch/x86/x86/idt.c:1.2	Mon Apr 28 20:23:40 2008
+++ src/sys/arch/x86/x86/idt.c	Sun Apr 19 14:11:37 2009
@@ -1,12 +1,12 @@
-/*	$NetBSD: idt.c,v 1.2 2008/04/28 20:23:40 martin Exp $	*/
+/*	$NetBSD: idt.c,v 1.3 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
- * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1996, 1997, 1998, 2000, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
- * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
- * Simulation Facility, NASA Ames Research Center.
+ * by Charles M. Hannum, by Jason R. Thorpe of the Numerical Aerospace
+ * Simulation Facility NASA Ames Research Center, and by Andrew Doran.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -65,46 +65,51 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: idt.c,v 1.2 2008/04/28 20:23:40 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: idt.c,v 1.3 2009/04/19 14:11:37 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mutex.h>
+#include <sys/cpu.h>
+#include <sys/atomic.h>
 
 #include <machine/segments.h>
 
 #if !defined(XEN)
 
-static kmutex_t idt_lock;
 struct gate_descriptor *idt;
 static char idt_allocmap[NIDT];
 
 /*
  * Allocate an IDT vector slot within the given range.
+ * cpu_lock will be held unless single threaded during early boot.
  */
-
 int
 idt_vec_alloc(int low, int high)
 {
 	int vec;
 
-	mutex_enter(&idt_lock);
+	KASSERT(mutex_owned(&cpu_lock) || !mp_online);
+
 	for (vec = low; vec <= high; vec++) {
 		if (idt_allocmap[vec] == 0) {
+			/* idt_vec_free() can be unlocked, so membar. */
+			membar_sync();
 			idt_allocmap[vec] = 1;
-			mutex_exit(&idt_lock);
 			return vec;
 		}
 	}
-	mutex_exit(&idt_lock);
 	return 0;
 }
 
 void
 idt_vec_reserve(int vec)
 {
-	int result = idt_vec_alloc(vec, vec);
+	int result;
+
+	KASSERT(mutex_owned(&cpu_lock) || !mp_online);
 
+	result = idt_vec_alloc(vec, vec);
 	if (result != vec) {
 		panic("%s: failed to reserve vec %d", __func__, vec);
 	}
@@ -113,30 +118,28 @@
 void
 idt_vec_set(int vec, void (*function)(void))
 {
-	/*
-	 * Vector should be allocated, so no locking needed.
-	 */
 
+	KASSERT(mutex_owned(&cpu_lock) || !mp_online);
 	KASSERT(idt_allocmap[vec] == 1);
 	setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 }
 
+/*
+ * Free IDT vector.  No locking required as release is atomic.
+ */
 void
 idt_vec_free(int vec)
 {
 
-	mutex_enter(&idt_lock);
 	unsetgate(&idt[vec]);
 	idt_allocmap[vec] = 0;
-	mutex_exit(&idt_lock);
 }
 
 void
 idt_init(void)
 {
 
-	mutex_init(&idt_lock, MUTEX_DEFAULT, IPL_NONE);
 }
 
 #endif /* !defined(XEN) */

Index: src/sys/arch/x86/x86/intr.c
diff -u src/sys/arch/x86/x86/intr.c:1.60 src/sys/arch/x86/x86/intr.c:1.61
--- src/sys/arch/x86/x86/intr.c:1.60	Tue Apr  7 18:24:23 2009
+++ src/sys/arch/x86/x86/intr.c	Sun Apr 19 14:11:37 2009
@@ -1,7 +1,7 @@
-/*	$NetBSD: intr.c,v 1.60 2009/04/07 18:24:23 dyoung Exp $	*/
+/*	$NetBSD: intr.c,v 1.61 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -133,7 +133,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: intr.c,v 1.60 2009/04/07 18:24:23 dyoung Exp $");
+__KERNEL_RCSID(0, "$NetBSD: intr.c,v 1.61 2009/04/19 14:11:37 ad Exp $");
 
 #include "opt_intrdebug.h"
 #include "opt_multiprocessor.h"
@@ -144,7 +144,6 @@
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 #include <sys/device.h>
-#include <sys/malloc.h>
 #include <sys/kmem.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
@@ -192,8 +191,6 @@
 #endif
 #endif
 
-kmutex_t x86_intr_lock;
-
 /*
  * Fill in default interrupt table (in case of spurious interrupt
  * during configuration of kernel), setup interrupt control unit
@@ -203,8 +200,6 @@
 {
 	int i;
 
-	mutex_init(&x86_intr_lock, MUTEX_DEFAULT, IPL_NONE);
-
 	/* icu vectors */
 	for (i = 0; i < NUM_LEGACY_IRQS; i++) {
 		idt_vec_reserve(ICU_OFFSET + i);
@@ -226,12 +221,22 @@
 int
 x86_nmi(void)
 {
+
 	log(LOG_CRIT, "NMI port 61 %x, port 70 %x\n", inb(0x61), inb(0x70));
 	return(0);
 }
 
 /*
  * Recalculate the interrupt masks from scratch.
+ * During early boot, anything goes and we are always called on the BP.
+ * When the system is up and running:
+ *
+ * => called with ci == curcpu()
+ * => cpu_lock held by the initiator
+ * => interrupts disabled on-chip (PSL_I)
+ *
+ * Do not call printf(), kmem_free() or other "heavyweight" routines
+ * from here.  This routine must be quick and must not block.
  */
 static void
 intr_calculatemasks(struct cpu_info *ci)
@@ -311,7 +316,7 @@
 {
 	struct intr_extra_bus *iebp;
 
-	iebp = malloc(sizeof(struct intr_extra_bus), M_TEMP, M_WAITOK);
+	iebp = kmem_alloc(sizeof(*iebp), KM_SLEEP);
 	iebp->bus = pba->pba_bus;
 	iebp->pci_chipset_tag = pba->pba_pc;
 	iebp->pci_bridge_tag = pba->pba_bridgetag;
@@ -350,10 +355,6 @@
 }
 #endif
 
-
-/*
- * XXX if defined(MULTIPROCESSOR) && .. ?
- */
 #if NIOAPIC > 0 || NACPI > 0
 int
 intr_find_mpmapping(int bus, int pin, int *handle)
@@ -416,15 +417,14 @@
 	int slot, i;
 	struct intrsource *isp;
 
+	KASSERT(mutex_owned(&cpu_lock));
+
 	if (pic == &i8259_pic) {
-		if (!CPU_IS_PRIMARY(ci))
-			return EBUSY;
+		KASSERT(CPU_IS_PRIMARY(ci));
 		slot = pin;
-		mutex_enter(&x86_intr_lock);
 	} else {
 		slot = -1;
 
-		mutex_enter(&x86_intr_lock);
 		/*
 		 * intr_allocate_slot has checked for an existing mapping.
 		 * Now look for a free slot.
@@ -436,17 +436,14 @@
 			}
 		}
 		if (slot == -1) {
-			mutex_exit(&x86_intr_lock);
 			return EBUSY;
 		}
 	}
 
 	isp = ci->ci_isources[slot];
 	if (isp == NULL) {
-		isp = malloc(sizeof (struct intrsource),
-		    M_DEVBUF, M_NOWAIT|M_ZERO);
+		isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
 		if (isp == NULL) {
-			mutex_exit(&x86_intr_lock);
 			return ENOMEM;
 		}
 		snprintf(isp->is_evname, sizeof (isp->is_evname),
@@ -455,7 +452,6 @@
 		    pic->pic_name, isp->is_evname);
 		ci->ci_isources[slot] = isp;
 	}
-	mutex_exit(&x86_intr_lock);
 
 	*index = slot;
 	return 0;
@@ -464,20 +460,23 @@
 /*
  * A simple round-robin allocator to assign interrupts to CPUs.
  */
-static int
+static int __noinline
 intr_allocate_slot(struct pic *pic, int pin, int level,
 		   struct cpu_info **cip, int *index, int *idt_slot)
 {
 	CPU_INFO_ITERATOR cii;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *lci;
 	struct intrsource *isp;
 	int slot, idtvec, error;
 
+	KASSERT(mutex_owned(&cpu_lock));
+
 	/* First check if this pin is already used by an interrupt vector. */
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		for (slot = 0 ; slot < MAX_INTR_SOURCES ; slot++) {
-			if ((isp = ci->ci_isources[slot]) == NULL)
+			if ((isp = ci->ci_isources[slot]) == NULL) {
 				continue;
+			}
 			if (isp->is_pic == pic && isp->is_pin == pin) {
 				*idt_slot = isp->is_idtvec;
 				*index = slot;
@@ -489,56 +488,73 @@
 
 	/*
 	 * The pic/pin combination doesn't have an existing mapping.
-	 * Find a slot for a new interrupt source and allocate an IDT
-	 * vector.
-	 *
-	 * For the i8259 case, this always uses the reserved slots
-	 * of the primary CPU and fixed IDT vectors.  This is required
-	 * by other parts of the code, see x86/intr.h for more details.
-	 *
-	 * For the IOAPIC case, interrupts are assigned to the
-	 * primary CPU by default, until it runs out of slots.
+	 * Find a slot for a new interrupt source.  For the i8259 case,
+	 * we always use reserved slots of the primary CPU.  Otherwise,
+	 * we make an attempt to balance the interrupt load.
 	 *
 	 * PIC and APIC usage are essentially exclusive, so the reservation
 	 * of the ISA slots is ignored when assigning IOAPIC slots.
-	 *
-	 * XXX Fix interrupt allocation to Application Processors.
-	 * XXX Check how many interrupts each CPU got and assign it to
-	 * XXX the least loaded CPU.  Consider adding options to bind
-	 * XXX interrupts to specific CPUs.
-	 * XXX Drop apic level support, just assign IDT vectors sequentially.
 	 */
-	ci = &cpu_info_primary;
-	error = intr_allocate_slot_cpu(ci, pic, pin, &slot);
-	if (error != 0) {
+	if (pic == &i8259_pic) {
+		/*
+		 * Must be directed to BP.
+		 */
+		ci = &cpu_info_primary;
+		error = intr_allocate_slot_cpu(ci, pic, pin, &slot);
+	} else {
 		/*
-		 * ..now try the others.
+		 * Find least loaded AP/BP and try to allocate there.
 		 */
-		for (CPU_INFO_FOREACH(cii, ci)) {
-			if (CPU_IS_PRIMARY(ci))
+		ci = NULL;
+		for (CPU_INFO_FOREACH(cii, lci)) {
+			if ((lci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
 				continue;
-			error = intr_allocate_slot_cpu(ci, pic, pin, &slot);
-			if (error == 0)
-				break;
+			}
+			if (ci == NULL ||
+			    ci->ci_nintrhand > lci->ci_nintrhand) {
+			    	ci = lci;
+			}
+		}
+		KASSERT(ci != NULL);
+		error = intr_allocate_slot_cpu(ci, pic, pin, &slot);
+
+		/*
+		 * If that did not work, allocate anywhere.
+		 */
+		if (error != 0) {
+			for (CPU_INFO_FOREACH(cii, ci)) {
+				if ((ci->ci_schedstate.spc_flags &
+				    SPCF_NOINTR) != 0) {
+					continue;
+				}
+				error = intr_allocate_slot_cpu(ci, pic,
+				    pin, &slot);
+				if (error == 0) {
+					break;
+				}
+			}
 		}
-		if (error != 0)
-			return EBUSY;
 	}
+	if (error != 0) {
+		return error;
+	}
+	KASSERT(ci != NULL);
 
-	if (pic == &i8259_pic)
+	/* 
+	 * Now allocate an IDT vector.
+	 * For the 8259 these are reserved up front.
+	 */
+	if (pic == &i8259_pic) {
 		idtvec = ICU_OFFSET + pin;
-	else
+	} else {
 		idtvec = idt_vec_alloc(APIC_LEVEL(level), IDT_INTR_HIGH);
-
+	}
 	if (idtvec == 0) {
-		mutex_enter(&x86_intr_lock);
 		evcnt_detach(&ci->ci_isources[slot]->is_evcnt);
-		free(ci->ci_isources[slot], M_DEVBUF);
+		kmem_free(ci->ci_isources[slot], sizeof(*(ci->ci_isources[slot])));
 		ci->ci_isources[slot] = NULL;
-		mutex_exit(&x86_intr_lock);
 		return EBUSY;
 	}
-
 	ci->ci_isources[slot]->is_idtvec = idtvec;
 	*idt_slot = idtvec;
 	*index = slot;
@@ -546,6 +562,23 @@
 	return 0;
 }
 
+static void
+intr_source_free(struct cpu_info *ci, int slot, struct pic *pic, int idtvec)
+{
+	struct intrsource *isp;
+
+	isp = ci->ci_isources[slot];
+
+	if (isp->is_handlers != NULL)
+		return;
+	ci->ci_isources[slot] = NULL;
+	evcnt_detach(&isp->is_evcnt);
+	kmem_free(isp, sizeof(*isp));
+	ci->ci_isources[slot] = NULL;
+	if (pic != &i8259_pic)
+		idt_vec_free(idtvec);
+}
+
 #ifdef MULTIPROCESSOR
 static int intr_biglock_wrapper(void *);
 
@@ -585,6 +618,59 @@
 	return NULL;
 }
 
+/*
+ * Handle per-CPU component of interrupt establish.
+ *
+ * => caller (on initiating CPU) holds cpu_lock on our behalf
+ * => arg1: struct intrhand *ih
+ * => arg2: int idt_vec
+ */
+static void
+intr_establish_xcall(void *arg1, void *arg2)
+{
+	struct intrsource *source;
+	struct intrstub *stubp;
+	struct intrhand *ih;
+	struct cpu_info *ci;
+	int idt_vec;
+	u_long psl;
+
+	ih = arg1;
+
+	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
+
+	ci = ih->ih_cpu;
+	source = ci->ci_isources[ih->ih_slot];
+	idt_vec = (int)(intptr_t)arg2;
+
+	/* Disable interrupts locally. */
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	/* Link in the handler and re-calculate masks. */
+	*(ih->ih_prevp) = ih;
+	intr_calculatemasks(ci);
+
+	/* Hook in new IDT vector and SPL state. */
+	if (source->is_resume == NULL || source->is_idtvec != idt_vec) {
+		if (source->is_idtvec != 0 && source->is_idtvec != idt_vec)
+			idt_vec_free(source->is_idtvec);
+		source->is_idtvec = idt_vec;
+		if (source->is_type == IST_LEVEL) {
+			stubp = &source->is_pic->pic_level_stubs[ih->ih_slot];
+		} else {
+			stubp = &source->is_pic->pic_edge_stubs[ih->ih_slot];
+		}
+		source->is_resume = stubp->ist_resume;
+		source->is_recurse = stubp->ist_recurse;
+		setgate(&idt[idt_vec], stubp->ist_entry, 0, SDT_SYS386IGT,
+		    SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+	}
+
+	/* Re-enable interrupts locally. */
+	x86_write_psl(psl);
+}
+
 void *
 intr_establish(int legacy_irq, struct pic *pic, int pin, int type, int level,
 	       int (*handler)(void *), void *arg, bool known_mpsafe)
@@ -593,10 +679,10 @@
 	struct cpu_info *ci;
 	int slot, error, idt_vec;
 	struct intrsource *source;
-	struct intrstub *stubp;
 #ifdef MULTIPROCESSOR
 	bool mpsafe = (known_mpsafe || level != IPL_VM);
 #endif /* MULTIPROCESSOR */
+	uint64_t where;
 
 #ifdef DIAGNOSTIC
 	if (legacy_irq != -1 && (legacy_irq < 0 || legacy_irq > 15))
@@ -606,18 +692,19 @@
 		panic("intr_establish: non-legacy IRQ on i8259");
 #endif
 
-	error = intr_allocate_slot(pic, pin, level, &ci, &slot,
-	    &idt_vec);
-	if (error != 0) {
-		printf("failed to allocate interrupt slot for PIC %s pin %d\n",
-		    pic->pic_name, pin);
+	ih = kmem_alloc(sizeof(*ih), KM_SLEEP);
+	if (ih == NULL) {
+		printf("intr_establish: can't allocate handler info\n");
 		return NULL;
 	}
 
-	/* no point in sleeping unless someone can free memory. */
-	ih = malloc(sizeof *ih, M_DEVBUF, cold ? M_NOWAIT : M_WAITOK);
-	if (ih == NULL) {
-		printf("intr_establish: can't allocate malloc handler info\n");
+	mutex_enter(&cpu_lock);
+	error = intr_allocate_slot(pic, pin, level, &ci, &slot, &idt_vec);
+	if (error != 0) {
+		mutex_exit(&cpu_lock);
+		kmem_free(ih, sizeof(*ih));
+		printf("failed to allocate interrupt slot for PIC %s pin %d\n",
+		    pic->pic_name, pin);
 		return NULL;
 	}
 
@@ -625,15 +712,14 @@
 
 	if (source->is_handlers != NULL &&
 	    source->is_pic->pic_type != pic->pic_type) {
-		free(ih, M_DEVBUF);
+		mutex_exit(&cpu_lock);
+		kmem_free(ih, sizeof(*ih));
 		printf("intr_establish: can't share intr source between "
 		       "different PIC types (legacy_irq %d pin %d slot %d)\n",
 		    legacy_irq, pin, slot);
 		return NULL;
 	}
 
-	mutex_enter(&x86_intr_lock);
-
 	source->is_pin = pin;
 	source->is_pic = pic;
 
@@ -645,23 +731,30 @@
 	case IST_LEVEL:
 		if (source->is_type == type)
 			break;
+		/* FALLTHROUGH */
 	case IST_PULSE:
 		if (type != IST_NONE) {
-			mutex_exit(&x86_intr_lock);
+			mutex_exit(&cpu_lock);
+			kmem_free(ih, sizeof(*ih));
+			intr_source_free(ci, slot, pic, idt_vec);
 			printf("intr_establish: pic %s pin %d: can't share "
 			       "type %d with %d\n", pic->pic_name, pin,
 				source->is_type, type);
-			free(ih, M_DEVBUF);
 			return NULL;
 		}
 		break;
 	default:
-		mutex_exit(&x86_intr_lock);
 		panic("intr_establish: bad intr type %d for pic %s pin %d\n",
 		    source->is_type, pic->pic_name, pin);
+		/* NOTREACHED */
 	}
 
-	pic->pic_hwmask(pic, pin);
+	/*
+	 * We're now committed.  Mask the interrupt in hardware and
+	 * count it for load distribution.
+	 */
+	(*pic->pic_hwmask)(pic, pin);
+	(ci->ci_nintrhand)++;
 
 	/*
 	 * Figure out where to put the handler.
@@ -670,11 +763,13 @@
 	 */
 	for (p = &ci->ci_isources[slot]->is_handlers;
 	     (q = *p) != NULL && q->ih_level > level;
-	     p = &q->ih_next)
-		;
+	     p = &q->ih_next) {
+		/* nothing */;
+	}
 
 	ih->ih_fun = ih->ih_realfun = handler;
 	ih->ih_arg = ih->ih_realarg = arg;
+	ih->ih_prevp = p;
 	ih->ih_next = *p;
 	ih->ih_level = level;
 	ih->ih_pin = pin;
@@ -686,27 +781,23 @@
 		ih->ih_arg = ih;
 	}
 #endif /* MULTIPROCESSOR */
-	*p = ih;
-
-	intr_calculatemasks(ci);
 
-	mutex_exit(&x86_intr_lock);
-
-	if (source->is_resume == NULL || source->is_idtvec != idt_vec) {
-		if (source->is_idtvec != 0 && source->is_idtvec != idt_vec)
-			idt_vec_free(source->is_idtvec);
-		source->is_idtvec = idt_vec;
-		stubp = type == IST_LEVEL ?
-		    &pic->pic_level_stubs[slot] : &pic->pic_edge_stubs[slot];
-		source->is_resume = stubp->ist_resume;
-		source->is_recurse = stubp->ist_recurse;
-		setgate(&idt[idt_vec], stubp->ist_entry, 0, SDT_SYS386IGT,
-		    SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+	/*
+	 * Call out to the remote CPU to update its interrupt state.
+	 * Only make RPCs if the APs are up and running.
+	 */
+	if (ci == curcpu() || !mp_online) {
+		intr_establish_xcall(ih, (void *)(intptr_t)idt_vec);
+	} else {
+		where = xc_unicast(0, intr_establish_xcall, ih,
+		    (void *)(intptr_t)idt_vec, ci);
+		xc_wait(where);
 	}
 
-	pic->pic_addroute(pic, ci, pin, idt_vec, type);
-
-	pic->pic_hwunmask(pic, pin);
+	/* All set up, so add a route for the interrupt and unmask it. */
+	(*pic->pic_addroute)(pic, ci, pin, idt_vec, type);
+	(*pic->pic_hwunmask)(pic, pin);
+	mutex_exit(&cpu_lock);
 
 #ifdef INTRDEBUG
 	printf("allocated pic %s type %s pin %d level %d to %s slot %d "
@@ -719,24 +810,37 @@
 }
 
 /*
- * Deregister an interrupt handler.
+ * Called on bound CPU to handle intr_disestablish().
+ *
+ * => caller (on initiating CPU) holds cpu_lock on our behalf
+ * => arg1: struct intrhand *ih
+ * => arg2: unused
  */
-void
-intr_disestablish(struct intrhand *ih)
+static void
+intr_disestablish_xcall(void *arg1, void *arg2)
 {
 	struct intrhand **p, *q;
 	struct cpu_info *ci;
 	struct pic *pic;
 	struct intrsource *source;
+	struct intrhand *ih;
+	u_long psl;
 	int idtvec;
 
+	ih = arg1;
 	ci = ih->ih_cpu;
+
+	KASSERT(ci == curcpu() || !mp_online);
+
+	/* Disable interrupts locally. */
+	psl = x86_read_psl();
+	x86_disable_intr();
+
 	pic = ci->ci_isources[ih->ih_slot]->is_pic;
 	source = ci->ci_isources[ih->ih_slot];
 	idtvec = source->is_idtvec;
 
-	mutex_enter(&x86_intr_lock);
-	pic->pic_hwmask(pic, ih->ih_pin);	
+	(*pic->pic_hwmask)(pic, ih->ih_pin);	
 	atomic_and_32(&ci->ci_ipending, ~(1 << ih->ih_slot));
 
 	/*
@@ -746,33 +850,56 @@
 	     p = &q->ih_next)
 		;
 	if (q == NULL) {
-		mutex_exit(&x86_intr_lock);
+		x86_write_psl(psl);
 		panic("intr_disestablish: handler not registered");
+		/* NOTREACHED */
 	}
 
 	*p = q->ih_next;
 
 	intr_calculatemasks(ci);
-	pic->pic_delroute(pic, ci, ih->ih_pin, idtvec, source->is_type);
-	pic->pic_hwunmask(pic, ih->ih_pin);
+	(*pic->pic_delroute)(pic, ci, ih->ih_pin, idtvec, source->is_type);
+	(*pic->pic_hwunmask)(pic, ih->ih_pin);
+
+	/* Re-enable interrupts. */
+	x86_write_psl(psl);
+
+	/* If the source is free we can drop it now. */
+	intr_source_free(ci, ih->ih_slot, pic, idtvec);
 
 #ifdef INTRDEBUG
 	printf("%s: remove slot %d (pic %s pin %d vec %d)\n",
 	    device_xname(ci->ci_dev), ih->ih_slot, pic->pic_name,
 	    ih->ih_pin, idtvec);
 #endif
+}
 
-	if (source->is_handlers == NULL) {
-		evcnt_detach(&source->is_evcnt);
-		free(source, M_DEVBUF);
-		ci->ci_isources[ih->ih_slot] = NULL;
-		if (pic != &i8259_pic)
-			idt_vec_free(idtvec);
-	}
-
-	free(ih, M_DEVBUF);
+/*
+ * Deregister an interrupt handler.
+ */
+void
+intr_disestablish(struct intrhand *ih)
+{
+	struct cpu_info *ci;
+	uint64_t where;
 
-	mutex_exit(&x86_intr_lock);
+	/*
+	 * Count the removal for load balancing.
+	 * Call out to the remote CPU to update its interrupt state.
+	 * Only make RPCs if the APs are up and running.
+	 */
+	mutex_enter(&cpu_lock);
+	ci = ih->ih_cpu;
+	(ci->ci_nintrhand)--;
+	KASSERT(ci->ci_nintrhand >= 0);
+	if (ci == curcpu() || !mp_online) {
+		intr_disestablish_xcall(ih, NULL);
+	} else {
+		where = xc_unicast(0, intr_disestablish_xcall, ih, NULL, ci);
+		xc_wait(where);
+	}	
+	mutex_exit(&cpu_lock);
+	kmem_free(ih, sizeof(*ih));
 }
 
 const char *
@@ -809,8 +936,6 @@
 
 }
 
-#define CONCAT(x,y)	__CONCAT(x,y)
-
 /*
  * Fake interrupt handler structures for the benefit of symmetry with
  * other interrupt sources, and the benefit of intr_calculatemasks()
@@ -859,9 +984,8 @@
 #endif
 
 #if NLAPIC > 0
-	isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_WAITOK|M_ZERO);
-	if (isp == NULL)
-		panic("can't allocate fixed interrupt source");
+	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
+	KASSERT(isp != NULL);
 	isp->is_recurse = Xrecurse_lapic_ltimer;
 	isp->is_resume = Xresume_lapic_ltimer;
 	fake_timer_intrhand.ih_level = IPL_CLOCK;
@@ -872,9 +996,8 @@
 	    device_xname(ci->ci_dev), "timer");
 
 #ifdef MULTIPROCESSOR
-	isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_WAITOK|M_ZERO);
-	if (isp == NULL)
-		panic("can't allocate fixed interrupt source");
+	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
+	KASSERT(isp != NULL);
 	isp->is_recurse = Xrecurse_lapic_ipi;
 	isp->is_resume = Xresume_lapic_ipi;
 	fake_ipi_intrhand.ih_level = IPL_IPI;
@@ -888,9 +1011,8 @@
 #endif
 #endif
 
-	isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_WAITOK|M_ZERO);
-	if (isp == NULL)
-		panic("can't allocate fixed interrupt source");
+	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
+	KASSERT(isp != NULL);
 	isp->is_recurse = Xpreemptrecurse;
 	isp->is_resume = Xpreemptresume;
 	fake_preempt_intrhand.ih_level = IPL_PREEMPT;
@@ -926,7 +1048,7 @@
 	ci->ci_idepth = -1;
 }
 
-#ifdef INTRDEBUG
+#if defined(INTRDEBUG) || defined(DDB)
 void
 intr_printconfig(void)
 {
@@ -967,9 +1089,8 @@
 
 	ci = l->l_cpu;
 
-	isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_WAITOK|M_ZERO);
-	if (isp == NULL)
-		panic("can't allocate fixed interrupt source");
+	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
+	KASSERT(isp != NULL);
 	isp->is_recurse = Xsoftintr;
 	isp->is_resume = Xsoftintr;
 	isp->is_pic = &softintr_pic;
@@ -1007,3 +1128,211 @@
 
 	intr_calculatemasks(ci);
 }
+
+static void
+intr_redistribute_xc_t(void *arg1, void *arg2)
+{
+	struct cpu_info *ci;
+	struct intrsource *isp;
+	int slot;
+	u_long psl;
+
+	ci = curcpu();
+	isp = arg1;
+	slot = (int)(intptr_t)arg2;
+
+	/* Disable interrupts locally. */
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	/* Hook it in and re-calculate masks. */
+	ci->ci_isources[slot] = isp;
+	intr_calculatemasks(curcpu());
+
+	/* Re-enable interrupts locally. */
+	x86_write_psl(psl);
+}
+
+static void
+intr_redistribute_xc_s1(void *arg1, void *arg2)
+{
+	struct pic *pic;
+	struct intrsource *isp;
+	struct cpu_info *nci;
+	u_long psl;
+
+	isp = arg1;
+	nci = arg2;
+
+	/*
+	 * Disable interrupts on-chip and mask the pin.  Back out
+	 * and let the interrupt be processed if one is pending.
+	 */
+	pic = isp->is_pic;
+	for (;;) {
+		psl = x86_read_psl();
+		x86_disable_intr();
+		if ((*pic->pic_trymask)(pic, isp->is_pin)) {
+			break;
+		}
+		x86_write_psl(psl);
+		DELAY(1000);
+	}
+
+	/* pic_addroute will unmask the interrupt. */
+	(*pic->pic_addroute)(pic, nci, isp->is_pin, isp->is_idtvec,
+	    isp->is_type);
+	x86_write_psl(psl);
+}
+
+static void
+intr_redistribute_xc_s2(void *arg1, void *arg2)
+{
+	struct cpu_info *ci;
+	u_long psl;
+	int slot;
+
+	ci = curcpu();
+	slot = (int)(uintptr_t)arg1;
+
+	/* Disable interrupts locally. */
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	/* Patch out the source and re-calculate masks. */
+	ci->ci_isources[slot] = NULL;
+	intr_calculatemasks(ci);
+
+	/* Re-enable interrupts locally. */
+	x86_write_psl(psl);
+}
+
+static bool
+intr_redistribute(struct cpu_info *oci)
+{
+	struct intrsource *isp;
+	struct intrhand *ih;
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *nci, *ici;
+	int oslot, nslot;
+	uint64_t where;
+
+	KASSERT(mutex_owned(&cpu_lock));
+
+	/* Look for an interrupt source that we can migrate. */
+	for (oslot = 0; oslot < MAX_INTR_SOURCES; oslot++) {
+		if ((isp = oci->ci_isources[oslot]) == NULL) {
+			continue;
+		}
+		if (isp->is_pic->pic_type == PIC_IOAPIC) {
+			break;
+		}
+	}
+	if (oslot == MAX_INTR_SOURCES) {
+		return false;
+	}
+
+	/* Find least loaded CPU and try to move there. */
+	nci = NULL;
+	for (CPU_INFO_FOREACH(cii, ici)) {
+		if ((ici->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
+			continue;
+		}
+		KASSERT(ici != oci);
+		if (nci == NULL || nci->ci_nintrhand > ici->ci_nintrhand) {
+			nci = ici;
+		}
+	}
+	if (nci == NULL) {
+		return false;
+	}
+	for (nslot = 0; nslot < MAX_INTR_SOURCES; nslot++) {
+		if (nci->ci_isources[nslot] == NULL) {
+			break;
+		}
+	}
+
+	/* If that did not work, allocate anywhere. */
+	if (nslot == MAX_INTR_SOURCES) {
+		for (CPU_INFO_FOREACH(cii, nci)) {
+			if ((nci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
+				continue;
+			}
+			KASSERT(nci != oci);
+			for (nslot = 0; nslot < MAX_INTR_SOURCES; nslot++) {
+				if (nci->ci_isources[nslot] == NULL) {
+					break;
+				}
+			}
+			if (nslot != MAX_INTR_SOURCES) {
+				break;
+			}
+		}
+	}
+	if (nslot == MAX_INTR_SOURCES) {
+		return false;
+	}
+
+	/*
+	 * Now we have new CPU and new slot.  Run a cross-call to set up
+	 * the new vector on the target CPU.
+	 */
+	where = xc_unicast(0, intr_redistribute_xc_t, isp,
+	    (void *)(intptr_t)nslot, nci);
+	xc_wait(where);
+	
+	/*
+	 * We're ready to go on the target CPU.  Run a cross call to
+	 * reroute the interrupt away from the source CPU.
+	 */
+	where = xc_unicast(0, intr_redistribute_xc_s1, isp, nci, oci);
+	xc_wait(where);
+
+	/* Sleep for (at least) 10ms to allow the change to take hold. */
+	(void)kpause("intrdist", false, mstohz(10), NULL);
+
+	/* Complete removal from the source CPU. */
+	where = xc_unicast(0, intr_redistribute_xc_s2,
+	    (void *)(uintptr_t)oslot, NULL, oci);
+	xc_wait(where);
+
+	/* Finally, take care of book-keeping. */
+	for (ih = isp->is_handlers; ih != NULL; ih = ih->ih_next) {
+		oci->ci_nintrhand--;
+		nci->ci_nintrhand++;
+		ih->ih_cpu = nci;
+	}
+
+	return true;
+}
+
+void
+cpu_intr_redistribute(void)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	KASSERT(mutex_owned(&cpu_lock));
+	KASSERT(mp_online);
+
+	/* Direct interrupts away from shielded CPUs. */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) == 0) {
+			continue;
+		}
+		while (intr_redistribute(ci)) {
+			/* nothing */
+		}
+	}
+
+	/* XXX should now re-balance */
+}
+
+u_int
+cpu_intr_count(struct cpu_info *ci)
+{
+
+	KASSERT(ci->ci_nintrhand >= 0);
+
+	return ci->ci_nintrhand;
+}

Index: src/sys/arch/x86/x86/ioapic.c
diff -u src/sys/arch/x86/x86/ioapic.c:1.39 src/sys/arch/x86/x86/ioapic.c:1.40
--- src/sys/arch/x86/x86/ioapic.c:1.39	Fri Feb 13 20:51:19 2009
+++ src/sys/arch/x86/x86/ioapic.c	Sun Apr 19 14:11:37 2009
@@ -1,11 +1,11 @@
-/* 	$NetBSD: ioapic.c,v 1.39 2009/02/13 20:51:19 bouyer Exp $	*/
+/* 	$NetBSD: ioapic.c,v 1.40 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
- * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 2000, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
- * by RedBack Networks Inc.
+ * by RedBack Networks Inc, and by Andrew Doran.
  *
  * Author: Bill Sommerfeld
  *
@@ -31,7 +31,6 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-
 /*
  * Copyright (c) 1999 Stefan Grefen
  *
@@ -65,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ioapic.c,v 1.39 2009/02/13 20:51:19 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ioapic.c,v 1.40 2009/04/19 14:11:37 ad Exp $");
 
 #include "opt_ddb.h"
 
@@ -108,6 +107,7 @@
 
 void ioapic_hwmask(struct pic *, int);
 void ioapic_hwunmask(struct pic *, int);
+bool ioapic_trymask(struct pic *, int);
 static void ioapic_addroute(struct pic *, struct cpu_info *, int, int, int);
 static void ioapic_delroute(struct pic *, struct cpu_info *, int, int, int);
 
@@ -302,6 +302,7 @@
 	sc->sc_pic.pic_hwunmask = ioapic_hwunmask;
 	sc->sc_pic.pic_addroute = ioapic_addroute;
 	sc->sc_pic.pic_delroute = ioapic_delroute;
+	sc->sc_pic.pic_trymask = ioapic_trymask;
 	sc->sc_pic.pic_edge_stubs = ioapic_edge_stubs;
 	sc->sc_pic.pic_level_stubs = ioapic_level_stubs;
 
@@ -402,7 +403,6 @@
 	uint32_t redlo;
 	uint32_t redhi;
 	int delmode;
-
 	struct ioapic_pin *pp;
 	struct mp_intr_map *map;
 	
@@ -410,39 +410,28 @@
 	map = pp->ip_map;
 	redlo = map == NULL ? IOAPIC_REDLO_MASK : map->redir;
 	delmode = (redlo & IOAPIC_REDLO_DEL_MASK) >> IOAPIC_REDLO_DEL_SHIFT;
+	redhi = (ci->ci_cpuid << IOAPIC_REDHI_DEST_SHIFT);
 
-	/* XXX magic numbers */
-	if ((delmode != 0) && (delmode != 1))
-		redhi = 0;
-	else if (pp->ip_type == IST_NONE) {
-		redlo |= IOAPIC_REDLO_MASK;
-		redhi = 0;
-	} else {
-		redlo |= (idt_vec & 0xff);
-		redlo |= (IOAPIC_REDLO_DEL_FIXED<<IOAPIC_REDLO_DEL_SHIFT);
-		redlo &= ~IOAPIC_REDLO_DSTMOD;
-		
-		/*
-		 * Destination: BSP CPU
-		 *
-		 * XXX will want to distribute interrupts across CPUs
-		 * eventually.  most likely, we'll want to vector each
-		 * interrupt to a specific CPU and load-balance across
-		 * CPUs.  but there's no point in doing that until after 
-		 * most interrupts run without the kernel lock.  
-		 */
-		redhi = (ci->ci_cpuid << IOAPIC_REDHI_DEST_SHIFT);
+	if (delmode == IOAPIC_REDLO_DEL_FIXED ||
+	    delmode == IOAPIC_REDLO_DEL_LOPRI) {
+	    	if (pp->ip_type == IST_NONE) {
+			redlo |= IOAPIC_REDLO_MASK;
+		} else {
+			redlo |= (idt_vec & 0xff);
+			redlo |= (IOAPIC_REDLO_DEL_FIXED<<IOAPIC_REDLO_DEL_SHIFT);
+			redlo &= ~IOAPIC_REDLO_DSTMOD;
 
-		/* XXX derive this bit from BIOS info */
-		if (pp->ip_type == IST_LEVEL)
-			redlo |= IOAPIC_REDLO_LEVEL;
-		else
-			redlo &= ~IOAPIC_REDLO_LEVEL;
-		if (map != NULL && ((map->flags & 3) == MPS_INTPO_DEF)) {
+			/* XXX derive this bit from BIOS info */
 			if (pp->ip_type == IST_LEVEL)
-				redlo |= IOAPIC_REDLO_ACTLO;
+				redlo |= IOAPIC_REDLO_LEVEL;
 			else
-				redlo &= ~IOAPIC_REDLO_ACTLO;
+				redlo &= ~IOAPIC_REDLO_LEVEL;
+			if (map != NULL && ((map->flags & 3) == MPS_INTPO_DEF)) {
+				if (pp->ip_type == IST_LEVEL)
+					redlo |= IOAPIC_REDLO_ACTLO;
+				else
+					redlo &= ~IOAPIC_REDLO_ACTLO;
+			}
 		}
 	}
 	ioapic_write(sc, IOAPIC_REDHI(pin), redhi);
@@ -511,6 +500,33 @@
 	ioapic_unlock(sc, flags);
 }
 
+bool
+ioapic_trymask(struct pic *pic, int pin)
+{
+	uint32_t redlo;
+	struct ioapic_softc *sc = pic->pic_ioapic;
+	u_long flags;
+	bool rv;
+
+	/* Mask it. */
+	flags = ioapic_lock(sc);
+	redlo = ioapic_read_ul(sc, IOAPIC_REDLO(pin));
+	redlo |= IOAPIC_REDLO_MASK;
+	ioapic_write_ul(sc, IOAPIC_REDLO(pin), redlo);
+
+	/* If pending, unmask and abort. */
+	redlo = ioapic_read_ul(sc, IOAPIC_REDLO(pin));
+	if ((redlo & (IOAPIC_REDLO_RIRR|IOAPIC_REDLO_DELSTS)) != 0) {
+		redlo &= ~IOAPIC_REDLO_MASK;
+		ioapic_write_ul(sc, IOAPIC_REDLO(pin), redlo);
+		rv = false;
+	} else {
+		rv = true;
+	}
+	ioapic_unlock(sc, flags);
+	return rv;
+}
+
 void
 ioapic_hwunmask(struct pic *pic, int pin)
 {

Index: src/sys/kern/kern_cpu.c
diff -u src/sys/kern/kern_cpu.c:1.41 src/sys/kern/kern_cpu.c:1.42
--- src/sys/kern/kern_cpu.c:1.41	Mon Jan 19 23:04:26 2009
+++ src/sys/kern/kern_cpu.c	Sun Apr 19 14:11:37 2009
@@ -1,7 +1,7 @@
-/*	$NetBSD: kern_cpu.c,v 1.41 2009/01/19 23:04:26 njoly Exp $	*/
+/*	$NetBSD: kern_cpu.c,v 1.42 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -56,9 +56,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.41 2009/01/19 23:04:26 njoly Exp $");
-
-#include "opt_compat_netbsd.h"
+__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.42 2009/04/19 14:11:37 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -81,10 +79,6 @@
 
 #include <uvm/uvm_extern.h>
 
-#ifdef COMPAT_50
-#include <compat/sys/cpuio.h>
-#endif
-
 void	cpuctlattach(int);
 
 static void	cpu_xc_online(struct cpu_info *);
@@ -163,17 +157,6 @@
 
 	mutex_enter(&cpu_lock);
 	switch (cmd) {
-#ifdef IOC_CPU_OSETSTATE
-		cpustate_t csb;
-
-	case IOC_CPU_OSETSTATE: {
-		cpustate50_t *ocs = data;
-		cpustate50_to_cpustate(ocs, &csb);
-		cs = &csb;
-		error = 1;
-		/*FALLTHROUGH*/
-	}
-#endif
 	case IOC_CPU_SETSTATE:
 		if (error == 0)
 			cs = data;
@@ -187,22 +170,10 @@
 			error = ESRCH;
 			break;
 		}
-		if (!cs->cs_intr) {
-			error = EOPNOTSUPP;
-			break;
-		}
+		error = cpu_setintr(ci, cs->cs_intr);
 		error = cpu_setstate(ci, cs->cs_online);
 		break;
 
-#ifdef IOC_CPU_OGETSTATE
-	case IOC_CPU_OGETSTATE: {
-		cpustate50_t *ocs = data;
-		cpustate50_to_cpustate(ocs, &csb);
-		cs = &csb;
-		error = 1;
-		/*FALLTHROUGH*/
-	}
-#endif
 	case IOC_CPU_GETSTATE:
 		if (error == 0)
 			cs = data;
@@ -218,15 +189,14 @@
 			cs->cs_online = false;
 		else
 			cs->cs_online = true;
-		cs->cs_intr = true;
-		cs->cs_lastmod = ci->ci_schedstate.spc_lastmod;
-#ifdef IOC_CPU_OGETSTATE
-		if (cmd == IOC_CPU_OGETSTATE) {
-			cpustate50_t *ocs = data;
-			cpustate_to_cpustate50(cs, ocs);
-			error = 0;
-		}
-#endif
+		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
+			cs->cs_intr = false;
+		else
+			cs->cs_intr = true;
+		cs->cs_lastmod = (int32_t)ci->ci_schedstate.spc_lastmod;
+		cs->cs_lastmodhi = (int32_t)
+		    (ci->ci_schedstate.spc_lastmod >> 32);
+		cs->cs_intrcnt = cpu_intr_count(ci) + 1;
 		break;
 
 	case IOC_CPU_MAPID:
@@ -275,15 +245,15 @@
 	int s;
 
 	/*
-	 * Thread which sent unicast (separate context) is holding
-	 * the cpu_lock for us.
+	 * Thread that made the cross call (separate context) holds
+	 * cpu_lock on our behalf.
 	 */
 	spc = &ci->ci_schedstate;
 	s = splsched();
 	spc->spc_flags |= SPCF_OFFLINE;
 	splx(s);
 
-	/* Take the first available CPU for the migration */
+	/* Take the first available CPU for the migration. */
 	for (CPU_INFO_FOREACH(cii, target_ci)) {
 		mspc = &target_ci->ci_schedstate;
 		if ((mspc->spc_flags & SPCF_OFFLINE) == 0)
@@ -402,6 +372,99 @@
 	return 0;
 }
 
+#ifdef __HAVE_INTR_CONTROL
+static void
+cpu_xc_intr(struct cpu_info *ci)
+{
+	struct schedstate_percpu *spc;
+	int s;
+
+	spc = &ci->ci_schedstate;
+	s = splsched();
+	spc->spc_flags &= ~SPCF_NOINTR;
+	splx(s);
+}
+
+static void
+cpu_xc_nointr(struct cpu_info *ci)
+{
+	struct schedstate_percpu *spc;
+	int s;
+
+	spc = &ci->ci_schedstate;
+	s = splsched();
+	spc->spc_flags |= SPCF_NOINTR;
+	splx(s);
+}
+
+int
+cpu_setintr(struct cpu_info *ci, bool intr)
+{
+	struct schedstate_percpu *spc;
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci2;
+	uint64_t where;
+	xcfunc_t func;
+	int nintr;
+
+	spc = &ci->ci_schedstate;
+
+	KASSERT(mutex_owned(&cpu_lock));
+
+	if (intr) {
+		if ((spc->spc_flags & SPCF_NOINTR) == 0)
+			return 0;
+		func = (xcfunc_t)cpu_xc_intr;
+	} else {
+		if ((spc->spc_flags & SPCF_NOINTR) != 0)
+			return 0;
+		/*
+		 * Ensure that at least one CPU within the system
+		 * is handing device interrupts.
+		 */
+		nintr = 0;
+		for (CPU_INFO_FOREACH(cii, ci2)) {
+			if ((ci2->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
+				continue;
+			if (ci2 == ci)
+				continue;
+			nintr++;
+		}
+		if (nintr == 0)
+			return EBUSY;
+		func = (xcfunc_t)cpu_xc_nointr;
+	}
+
+	where = xc_unicast(0, func, ci, NULL, ci);
+	xc_wait(where);
+	if (intr) {
+		KASSERT((spc->spc_flags & SPCF_NOINTR) == 0);
+	} else if ((spc->spc_flags & SPCF_NOINTR) == 0) {
+		/* If was not set offline, then it is busy */
+		return EBUSY;
+	}
+
+	/* Direct interrupts away from the CPU and record the change. */
+	cpu_intr_redistribute();
+	spc->spc_lastmod = time_second;
+	return 0;
+}
+#else	/* __HAVE_INTR_CONTROL */
+int
+cpu_setintr(struct cpu_info *ci, bool intr)
+{
+
+	return EOPNOTSUPP;
+}
+
+u_int
+cpu_intr_count(struct cpu_info *ci)
+{
+
+	return 0;	/* 0 == "don't know" */
+}
+#endif	/* __HAVE_INTR_CONTROL */
+
 bool
 cpu_softintr_p(void)
 {

Index: src/sys/sys/cpu.h
diff -u src/sys/sys/cpu.h:1.29 src/sys/sys/cpu.h:1.30
--- src/sys/sys/cpu.h:1.29	Thu Apr  9 00:34:44 2009
+++ src/sys/sys/cpu.h	Sun Apr 19 14:11:37 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.29 2009/04/09 00:34:44 yamt Exp $	*/
+/*	$NetBSD: cpu.h,v 1.30 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 2007 YAMAMOTO Takashi,
@@ -71,12 +71,15 @@
 lwp_t	*cpu_switchto(lwp_t *, lwp_t *, bool);
 struct	cpu_info *cpu_lookup(u_int);
 int	cpu_setstate(struct cpu_info *, bool);
+int	cpu_setintr(struct cpu_info *, bool);
 bool	cpu_intr_p(void);
 bool	cpu_softintr_p(void);
 bool	cpu_kpreempt_enter(uintptr_t, int);
 void	cpu_kpreempt_exit(uintptr_t);
 bool	cpu_kpreempt_disabled(void);
 int	cpu_lwp_setprivate(lwp_t *, void *);
+void	cpu_intr_redistribute(void);
+u_int	cpu_intr_count(struct cpu_info *);
 
 CIRCLEQ_HEAD(cpuqueue, cpu_info);
 

Index: src/sys/sys/cpuio.h
diff -u src/sys/sys/cpuio.h:1.3 src/sys/sys/cpuio.h:1.4
--- src/sys/sys/cpuio.h:1.3	Mon Jan 19 17:39:02 2009
+++ src/sys/sys/cpuio.h	Sun Apr 19 14:11:37 2009
@@ -1,7 +1,7 @@
-/*	$NetBSD: cpuio.h,v 1.3 2009/01/19 17:39:02 christos Exp $	*/
+/*	$NetBSD: cpuio.h,v 1.4 2009/04/19 14:11:37 ad Exp $	*/
 
 /*-
- * Copyright (c) 2007 The NetBSD Foundation, Inc.
+ * Copyright (c) 2007, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -49,16 +49,16 @@
 	bool		cs_online;	/* running unbound LWPs */
 	bool		cs_intr;	/* fielding interrupts */
 	bool		cs_unused[2];	/* reserved */
-	time_t		cs_lastmod;	/* time of last state change */
+	int32_t		cs_lastmod;	/* time of last state change */
 	char		cs_name[16];	/* reserved */
-	uint32_t	cs_reserved[4];	/* reserved */
+	int32_t		cs_lastmodhi;	/* time of last state change */
+	uint32_t	cs_intrcnt;	/* count of interrupt handlers + 1 */
+	uint32_t	cs_reserved[2];	/* reserved */
 } cpustate_t;
 
-/* 0 IOC_CPU_OSETSTATE */
-/* 1 IOC_CPU_OGETSTATE */
+#define	IOC_CPU_SETSTATE	_IOW('c', 0, cpustate_t)
+#define	IOC_CPU_GETSTATE	_IOWR('c', 1, cpustate_t)
 #define	IOC_CPU_GETCOUNT	_IOR('c', 2, int)
 #define	IOC_CPU_MAPID		_IOWR('c', 3, int)
-#define	IOC_CPU_SETSTATE	_IOW('c', 4, cpustate_t)
-#define	IOC_CPU_GETSTATE	_IOWR('c', 5, cpustate_t)
 
 #endif /* !_SYS_CPUIO_H_ */

Index: src/usr.sbin/cpuctl/cpuctl.c
diff -u src/usr.sbin/cpuctl/cpuctl.c:1.13 src/usr.sbin/cpuctl/cpuctl.c:1.14
--- src/usr.sbin/cpuctl/cpuctl.c:1.13	Wed Jan 28 22:37:09 2009
+++ src/usr.sbin/cpuctl/cpuctl.c	Sun Apr 19 14:11:38 2009
@@ -1,7 +1,7 @@
-/*	$NetBSD: cpuctl.c,v 1.13 2009/01/28 22:37:09 ad Exp $	*/
+/*	$NetBSD: cpuctl.c,v 1.14 2009/04/19 14:11:38 ad Exp $	*/
 
 /*-
- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -31,7 +31,7 @@
 
 #ifndef lint
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: cpuctl.c,v 1.13 2009/01/28 22:37:09 ad Exp $");
+__RCSID("$NetBSD: cpuctl.c,v 1.14 2009/04/19 14:11:38 ad Exp $");
 #endif /* not lint */
 
 #include <sys/param.h>
@@ -61,6 +61,8 @@
 void	cpu_list(char **);
 void	cpu_offline(char **);
 void	cpu_online(char **);
+void	cpu_intr(char **);
+void	cpu_nointr(char **);
 
 struct cmdtab {
 	const char	*label;
@@ -71,6 +73,8 @@
 	{ "list", 0, cpu_list },
 	{ "offline", 1, cpu_offline },
 	{ "online", 1, cpu_online },
+	{ "intr", 1, cpu_intr },
+	{ "nointr", 1, cpu_nointr },
 	{ NULL, 0, NULL },
 };
 
@@ -113,6 +117,8 @@
 	fprintf(stderr, "       %s list\n", progname);
 	fprintf(stderr, "       %s offline cpuno\n", progname);
 	fprintf(stderr, "       %s online cpuno\n", progname);
+	fprintf(stderr, "       %s intr cpuno\n", progname);
+	fprintf(stderr, "       %s nointr cpuno\n", progname);
 	exit(EXIT_FAILURE);
 	/* NOTREACHED */
 }
@@ -144,6 +150,37 @@
 }
 
 void
+cpu_intr(char **argv)
+{
+	cpustate_t cs;
+
+	cs.cs_id = getcpuid(argv);
+	if (ioctl(fd, IOC_CPU_GETSTATE, &cs) < 0)
+		err(EXIT_FAILURE, "IOC_CPU_GETSTATE");
+	cs.cs_intr = true;
+	if (ioctl(fd, IOC_CPU_SETSTATE, &cs) < 0)
+		err(EXIT_FAILURE, "IOC_CPU_SETSTATE");
+}
+
+void
+cpu_nointr(char **argv)
+{
+	cpustate_t cs;
+
+	cs.cs_id = getcpuid(argv);
+	if (ioctl(fd, IOC_CPU_GETSTATE, &cs) < 0)
+		err(EXIT_FAILURE, "IOC_CPU_GETSTATE");
+	cs.cs_intr = false;
+	if (ioctl(fd, IOC_CPU_SETSTATE, &cs) < 0) {
+		if (errno == EOPNOTSUPP) {
+			warnx("interrupt control not supported on "
+			    "this platform");
+		} else
+			err(EXIT_FAILURE, "IOC_CPU_SETSTATE");
+	}
+}
+
+void
 cpu_identify(char **argv)
 {
 	char name[32];
@@ -198,12 +235,15 @@
 	const char *state, *intr;
 	cpustate_t cs;
 	u_int cnt, i;
+	time_t lastmod;
+	char ibuf[16], *ts;
 	
 	if (ioctl(fd, IOC_CPU_GETCOUNT, &cnt) < 0)
 		err(EXIT_FAILURE, "IOC_CPU_GETCOUNT");
 
-	printf("Num  HwId Unbound LWPs Interrupts     Last change\n");
- 	printf("---- ---- ------------ -------------- ----------------------------\n");
+	printf(
+"Num  HwId Unbound LWPs Interrupts Last change              #Intr\n"
+"---- ---- ------------ ---------- ------------------------ -----\n");
 
 	for (i = 0; i < cnt; i++) {
 		cs.cs_id = i;
@@ -219,8 +259,16 @@
 			intr = "intr";
 		else
 			intr = "nointr";
-		printf("%-4d %-4x %-12s %-12s   %s", i, cs.cs_id, state,
-		   intr, asctime(localtime(&cs.cs_lastmod)));
+		if (cs.cs_intrcnt == 0)
+			strcpy(ibuf, "?");
+		else
+			snprintf(ibuf, sizeof(ibuf), "%d", cs.cs_intrcnt - 1);
+		lastmod = (time_t)cs.cs_lastmod |
+		    ((time_t)cs.cs_lastmodhi << 32);
+		ts = asctime(localtime(&lastmod));
+		ts[strlen(ts) - 1] = '\0';
+		printf("%-4d %-4x %-12s %-10s %s %s\n", i, cs.cs_id, state,
+		   intr, ts, ibuf);
 	}
 }
 
