[PATCH x86 for review II] [4/39] i386: add idle notifier

2007-02-11 Thread Andi Kleen

From: Stephane Eranian <[EMAIL PROTECTED]>

Add a notifier mechanism to the low level idle loop.  You can register a
callback function which gets invoked on entry and exit from the low level idle
loop.  The low level idle loop is defined as the polling loop, low-power call,
or the mwait instruction.  Interrupts processed by the idle thread are not
considered part of the low level loop.

The notifier can be used to measure precisely how much time is spent in useless
execution (or low power mode).  The perfmon subsystem uses it to turn on/off
monitoring.
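
As a rough illustration (not part of the patch), a subsystem could hook the
chain like this, using the interface added in asm-i386/idle.h below:

	#include <linux/notifier.h>
	#include <asm/idle.h>

	/* hypothetical client, for illustration only */
	static int my_idle_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
	{
		if (action == IDLE_START)
			;	/* this CPU is entering the low level idle loop */
		else if (action == IDLE_END)
			;	/* this CPU left the low level idle loop */
		return NOTIFY_OK;
	}

	static struct notifier_block my_idle_nb = {
		.notifier_call = my_idle_notify,
	};

	/* from the subsystem's init code: */
	idle_notifier_register(&my_idle_nb);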

Signed-off-by: stephane eranian <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/apic.c  |4 ++
 arch/i386/kernel/cpu/mcheck/p4.c |2 +
 arch/i386/kernel/irq.c   |3 ++
 arch/i386/kernel/process.c   |   53 ++-
 arch/i386/kernel/smp.c   |2 +
 include/asm-i386/idle.h  |   14 ++
 include/asm-i386/processor.h |8 +
 7 files changed, 85 insertions(+), 1 deletion(-)

Index: linux/arch/i386/kernel/apic.c
===
--- linux.orig/arch/i386/kernel/apic.c
+++ linux/arch/i386/kernel/apic.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include <asm/idle.h>
 
 #include 
 #include 
@@ -1255,6 +1256,7 @@ fastcall void smp_apic_timer_interrupt(s
 * Besides, if we don't timer interrupts ignore the global
 * interrupt lock, which is the WrongThing (tm) to do.
 */
+   exit_idle();
irq_enter();
smp_local_timer_interrupt();
irq_exit();
@@ -1305,6 +1307,7 @@ fastcall void smp_spurious_interrupt(str
 {
unsigned long v;
 
+   exit_idle();
irq_enter();
/*
 * Check if this really is a spurious interrupt and ACK it
@@ -1329,6 +1332,7 @@ fastcall void smp_error_interrupt(struct
 {
unsigned long v, v1;
 
+   exit_idle();
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
Index: linux/arch/i386/kernel/cpu/mcheck/p4.c
===
--- linux.orig/arch/i386/kernel/cpu/mcheck/p4.c
+++ linux/arch/i386/kernel/cpu/mcheck/p4.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include <asm/idle.h>
 
 #include 
 
@@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(
 
 fastcall void smp_thermal_interrupt(struct pt_regs *regs)
 {
+   exit_idle();
irq_enter();
vendor_thermal_interrupt(regs);
irq_exit();
Index: linux/arch/i386/kernel/irq.c
===
--- linux.orig/arch/i386/kernel/irq.c
+++ linux/arch/i386/kernel/irq.c
@@ -19,6 +19,8 @@
 #include 
 #include 
 
+#include <asm/idle.h>
+
 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) cacheline_internodealigned_in_smp;
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
@@ -61,6 +63,7 @@ fastcall unsigned int do_IRQ(struct pt_r
union irq_ctx *curctx, *irqctx;
u32 *isp;
 #endif
+   exit_idle();
 
if (unlikely((unsigned)irq >= NR_IRQS)) {
printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
Index: linux/arch/i386/kernel/process.c
===
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include <asm/idle.h>
 #ifdef CONFIG_MATH_EMULATION
 #include 
 #endif
@@ -80,6 +81,42 @@ void (*pm_idle)(void);
 EXPORT_SYMBOL(pm_idle);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+   atomic_notifier_chain_register(&idle_notifier, n);
+}
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+   atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+
+static DEFINE_PER_CPU(volatile unsigned long, idle_state);
+
+void enter_idle(void)
+{
+   /* needs to be atomic w.r.t. interrupts, not against other CPUs */
+   __set_bit(0, &__get_cpu_var(idle_state));
+   atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+static void __exit_idle(void)
+{
+   /* needs to be atomic w.r.t. interrupts, not against other CPUs */
+   if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0)
+   return;
+   atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+void exit_idle(void)
+{
+   if (current->pid)
+   return;
+   __exit_idle();
+}
+
 void disable_hlt(void)
 {
hlt_counter++;
@@ -130,6 +167,7 @@ EXPORT_SYMBOL(default_idle);
  */
 static void poll_idle (void)
 {
+   local_irq_enable();
cpu_relax();
 }
 
@@ -189,7 +227,16 @@ void cpu_idle(void)
play_dead();
 
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+
+   

Re: [PATCH x86 for review II] [18/39] x86_64: Allow to run a program when a machine check event is detected

2007-02-11 Thread Oliver Neukum
Am Montag, 12. Februar 2007 08:38 schrieb Andi Kleen:
> When a machine check event is detected (including a AMD RevF threshold 
> overflow event) allow to run a "trigger" program. This allows user space
> to react to such events sooner.

Could this not be merged with other reporting mechanisms? This looks like
a new incarnation of the /etc/hotplug code.

Regards
Oliver


[PATCH x86 for review II] [5/39] i386: improve sched_clock() on i686

2007-02-11 Thread Andi Kleen

From: Ingo Molnar <[EMAIL PROTECTED]>

Clean up sched_clock() on i686: it will use the TSC if available and falls
back to jiffies only if the user asked for it to be disabled via notsc or
the CPU calibration code didn't figure out the right cpu_khz.

This generally makes the scheduler timestamps more fine-grained, on all
hardware.  (the current scheduler is pretty resistant against asynchronous
sched_clock() values on different CPUs, it will allow at most up to a jiffy
of jitter.)

Also simplify sched_clock()'s check for TSC availability: propagate the
desire and ability to use the TSC into the tsc_disable flag, previously
this flag only indicated whether the notsc option was passed.  This makes
the rare low-res sched_clock() codepath a single branch off a read-mostly
flag.
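
For reference, and not part of this patch: the TSC path turns cycles into
nanoseconds via the scale set up by set_cyc2ns_scale().  A rough sketch of
that conversion (the 2^10 fixed-point factor is an assumption about this
code; the cpu_khz value is made up):

	/* ns ~= cycles * 1000000 / cpu_khz, done as fixed point */
	static unsigned long long cyc2ns_sketch(unsigned long long cycles,
						unsigned long cpu_khz)
	{
		unsigned long scale = (1000000UL << 10) / cpu_khz;

		/* cpu_khz = 2400000 (2.4 GHz): scale ~= 426,
		 * so 1000 cycles -> (1000 * 426) >> 10 ~= 416 ns */
		return (cycles * scale) >> 10;
	}

The jiffies fallback, by contrast, only advances once per timer tick.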

Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/tsc.c  |   22 ++
 include/asm-i386/bugs.h |2 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

Index: linux/arch/i386/kernel/tsc.c
===
--- linux.orig/arch/i386/kernel/tsc.c
+++ linux/arch/i386/kernel/tsc.c
@@ -112,13 +112,10 @@ unsigned long long sched_clock(void)
return (*custom_sched_clock)();
 
/*
-* in the NUMA case we dont use the TSC as they are not
-* synchronized across all CPUs.
+* Fall back to jiffies if there's no TSC available:
 */
-#ifndef CONFIG_NUMA
-   if (!cpu_khz || check_tsc_unstable())
-#endif
-   /* no locking but a rare wrong value is not a big deal */
+   if (unlikely(tsc_disable))
+   /* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
/* read the Time Stamp Counter: */
@@ -198,13 +195,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
 void __init tsc_init(void)
 {
if (!cpu_has_tsc || tsc_disable)
-   return;
+   goto out_no_tsc;
 
cpu_khz = calculate_cpu_khz();
tsc_khz = cpu_khz;
 
if (!cpu_khz)
-   return;
+   goto out_no_tsc;
 
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
@@ -212,6 +209,15 @@ void __init tsc_init(void)
 
set_cyc2ns_scale(cpu_khz);
use_tsc_delay();
+   return;
+
+out_no_tsc:
+   /*
+* Set the tsc_disable flag if there's no TSC support, this
+* makes it a fast flag for the kernel to see whether it
+* should be using the TSC.
+*/
+   tsc_disable = 1;
 }
 
 #ifdef CONFIG_CPU_FREQ
Index: linux/include/asm-i386/bugs.h
===
--- linux.orig/include/asm-i386/bugs.h
+++ linux/include/asm-i386/bugs.h
@@ -160,7 +160,7 @@ static void __init check_config(void)
  * If we configured ourselves for a TSC, we'd better have one!
  */
 #ifdef CONFIG_X86_TSC
-   if (!cpu_has_tsc)
+   if (!cpu_has_tsc && !tsc_disable)
panic("Kernel compiled for Pentium+, requires TSC feature!");
 #endif
 


[PATCH x86 for review II] [12/39] i386: Handle 32 bit PerfMon Counter writes cleanly in oprofile

2007-02-11 Thread Andi Kleen

From: Venkatesh Pallipadi <[EMAIL PROTECTED]>

Handle these 32 bit perfmon counter MSR writes cleanly in oprofile.

Signed-off-by: Venkatesh Pallipadi <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/i386/oprofile/op_model_ppro.c |9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

Index: linux/arch/i386/oprofile/op_model_ppro.c
===
--- linux.orig/arch/i386/oprofile/op_model_ppro.c
+++ linux/arch/i386/oprofile/op_model_ppro.c
@@ -24,7 +24,8 @@
 
 #define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
 #define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} 
while (0)
-#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 
-1);} while (0)
+#define CTR_32BIT_WRITE(l,msrs,c)  \
+   do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0)
 #define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
 
 #define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
@@ -79,7 +80,7 @@ static void ppro_setup_ctrs(struct op_ms
for (i = 0; i < NUM_COUNTERS; ++i) {
if (unlikely(!CTR_IS_RESERVED(msrs,i)))
continue;
-   CTR_WRITE(1, msrs, i);
+   CTR_32BIT_WRITE(1, msrs, i);
}
 
/* enable active counters */
@@ -87,7 +88,7 @@ static void ppro_setup_ctrs(struct op_ms
if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
reset_value[i] = counter_config[i].count;
 
-   CTR_WRITE(counter_config[i].count, msrs, i);
+   CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
 
CTRL_READ(low, high, msrs, i);
CTRL_CLEAR(low);
@@ -116,7 +117,7 @@ static int ppro_check_ctrs(struct pt_reg
CTR_READ(low, high, msrs, i);
if (CTR_OVERFLOWED(low)) {
oprofile_add_sample(regs, i);
-   CTR_WRITE(reset_value[i], msrs, i);
+   CTR_32BIT_WRITE(reset_value[i], msrs, i);
}
}
 


[PATCH x86 for review II] [11/39] i386: Handle 32 bit PerfMon Counter writes cleanly in i386 nmi_watchdog

2007-02-11 Thread Andi Kleen

From: Venkatesh Pallipadi <[EMAIL PROTECTED]>

Change i386 nmi handler to handle 32 bit perfmon counter MSR writes cleanly.

Signed-off-by: Venkatesh Pallipadi <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/i386/kernel/nmi.c |   64 -
 1 file changed, 48 insertions(+), 16 deletions(-)

Index: linux/arch/i386/kernel/nmi.c
===
--- linux.orig/arch/i386/kernel/nmi.c
+++ linux/arch/i386/kernel/nmi.c
@@ -216,6 +216,28 @@ static __init void nmi_cpu_busy(void *da
 }
 #endif
 
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+   u64 counter_val;
+   unsigned int retval = hz;
+
+   /*
+* On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
+* are writable, with higher bits sign extending from bit 31.
+* So, we can only program the counter with 31 bit values and
+* 32nd bit should be 1, for 33.. to be 1.
+* Find the appropriate nmi_hz
+*/
+   counter_val = (u64)cpu_khz * 1000;
+   do_div(counter_val, retval);
+   if (counter_val > 0x7fffffffULL) {
+   u64 count = (u64)cpu_khz * 1000;
+   do_div(count, 0x7fffffffUL);
+   retval = count + 1;
+   }
+   return retval;
+}
+
 static int __init check_nmi_watchdog(void)
 {
unsigned int *prev_nmi_count;
@@ -281,18 +303,10 @@ static int __init check_nmi_watchdog(voi
struct nmi_watchdog_ctlblk *wd = 
&__get_cpu_var(nmi_watchdog_ctlblk);
 
nmi_hz = 1;
-   /*
-* On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
-* are writable, with higher bits sign extending from bit 31.
-* So, we can only program the counter with 31 bit values and
-* 32nd bit should be 1, for 33.. to be 1.
-* Find the appropriate nmi_hz
-*/
-   if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
-   ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
-   u64 count = (u64)cpu_khz * 1000;
-   do_div(count, 0x7fffffffUL);
-   nmi_hz = count + 1;
+
+   if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+   wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+   nmi_hz = adjust_for_32bit_ctr(nmi_hz);
}
}
 
@@ -442,6 +456,17 @@ static void write_watchdog_counter(unsig
wrmsrl(perfctr_msr, 0 - count);
 }
 
+static void write_watchdog_counter32(unsigned int perfctr_msr,
+   const char *descr)
+{
+   u64 count = (u64)cpu_khz * 1000;
+
+   do_div(count, nmi_hz);
+   if(descr)
+   Dprintk("setting %s to -0x%08Lx\n", descr, count);
+   wrmsr(perfctr_msr, (u32)(-count), 0);
+}
+
 /* Note that these events don't tick when the CPU idles. This means
the frequency varies with CPU load. */
 
@@ -531,7 +556,8 @@ static int setup_p6_watchdog(void)
 
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
-   write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
+   nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+   write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
@@ -704,7 +730,8 @@ static int setup_intel_arch_watchdog(voi
 
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
-   write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
+   nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+   write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
@@ -956,6 +983,8 @@ __kprobes int nmi_watchdog_tick(struct p
dummy &= ~P4_CCCR_OVF;
wrmsrl(wd->cccr_msr, dummy);
apic_write(APIC_LVTPC, APIC_DM_NMI);
+   /* start the cycle over again */
+   write_watchdog_counter(wd->perfctr_msr, NULL);
}
else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
@@ -964,9 +993,12 @@ __kprobes int nmi_watchdog_tick(struct p
 * other P6 variant.
 * ArchPerfom/Core Duo also needs this */
apic_write(APIC_LVTPC, APIC_DM_NMI);
+   /* P6/ARCH_PERFMON has 32 bit counter write */
+   write_watchdog_counter32(wd->perfctr_msr, NULL);
+   } else {
+   /* start the cycle over again */

[PATCH x86 for review II] [13/39] i386: CONFIG_PHYSICAL_ALIGN limited to 4M?

2007-02-11 Thread Andi Kleen

From: Rene Herman <[EMAIL PROTECTED]>
A while ago it was remarked on list here that keeping the kernel 4M 
aligned physically might be a performance win if the added 1M (it 
normally loads at 1M) meant it would fit on one 4M aligned hugepage 
instead of 2 and since that time I've been doing such.

In fact, while I was at it, I ran the kernel at 16M; while admittedly a 
bit of a non-issue, having never experienced ZONE_DMA shortage, I am an 
ISA user on a >16M machine so this seemed to make sense -- no kernel 
eating up "precious" ISA-DMAable memory.

Recently CONFIG_PHYSICAL_START was replaced by CONFIG_PHYSICAL_ALIGN 
(commit e69f202d0a1419219198566e1c22218a5c71a9a6) and while 4M alignment 
is still possible, that's also the strictest alignment allowed meaning I 
can't load my (non-relocatable) kernel at 16M anymore.

If I just apply the following and set it to 16M, things seem to be 
working for me. Was there an important reason to limit the alignment to 
4M, and if so, even on non relocatable kernels?

Rene.

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/i386/Kconfig |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/i386/Kconfig
===
--- linux.orig/arch/i386/Kconfig
+++ linux/arch/i386/Kconfig
@@ -843,7 +843,7 @@ config RELOCATABLE
 config PHYSICAL_ALIGN
hex "Alignment value to which kernel should be aligned"
default "0x10"
-   range 0x2000 0x40
+   range 0x2000 0x100
help
  This value puts the alignment restrictions on physical address
  where kernel is loaded and run from. Kernel is compiled for an


[PATCH x86 for review II] [10/39] x86_64: Handle 32 bit PerfMon Counter writes cleanly in x86_64 nmi_watchdog

2007-02-11 Thread Andi Kleen

From: Venkatesh Pallipadi <[EMAIL PROTECTED]>


P6 CPUs, and Core/Core 2 CPUs which have the 'architectural perf mon' feature,
only support writes of the low 32 bits of the Performance Monitoring Counters.
Bits 32..39 are sign-extended from bit 31 and bits 40..63 are reserved
and should be zero.

This patch:

Change x86_64 nmi handler to handle this case cleanly.
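
As a worked example of the adjustment (illustrative numbers, not from the
patch): the watchdog programs the counter with -(cpu_khz * 1000 / nmi_hz),
which now has to fit in 31 bits:

	/* sketch mirroring adjust_for_32bit_ctr(); with cpu_khz = 3000000
	 * (3 GHz) and hz = 1, 3000000000 > 0x7fffffff, so this returns 2,
	 * i.e. the watchdog NMI fires twice per second instead of once. */
	static unsigned int adjust_sketch(unsigned long cpu_khz, unsigned int hz)
	{
		unsigned long long period = (unsigned long long)cpu_khz * 1000 / hz;

		if (period > 0x7fffffffULL)
			hz = (unsigned long long)cpu_khz * 1000 / 0x7fffffffULL + 1;
		return hz;
	}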

Signed-off-by: Venkatesh Pallipadi <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/nmi.c |   46 --
 1 file changed, 32 insertions(+), 14 deletions(-)

Index: linux/arch/x86_64/kernel/nmi.c
===
--- linux.orig/arch/x86_64/kernel/nmi.c
+++ linux/arch/x86_64/kernel/nmi.c
@@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *da
 }
 #endif
 
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+   unsigned int retval = hz;
+
+   /*
+* On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+* are writable, with higher bits sign extending from bit 31.
+* So, we can only program the counter with 31 bit values and
+* 32nd bit should be 1, for 33.. to be 1.
+* Find the appropriate nmi_hz
+*/
+   if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) {
+   retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
+   }
+   return retval;
+}
+
 int __init check_nmi_watchdog (void)
 {
int *counts;
@@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void)
struct nmi_watchdog_ctlblk *wd = 
&__get_cpu_var(nmi_watchdog_ctlblk);
 
nmi_hz = 1;
-   /*
-* On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
-* are writable, with higher bits sign extending from bit 31.
-* So, we can only program the counter with 31 bit values and
-* 32nd bit should be 1, for 33.. to be 1.
-* Find the appropriate nmi_hz
-*/
-   if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
-   ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
-   nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
-   }
+   if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0)
+   nmi_hz = adjust_for_32bit_ctr(nmi_hz);
}
 
kfree(counts);
@@ -634,7 +642,9 @@ static int setup_intel_arch_watchdog(voi
 
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
-   wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
+   nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+   wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
 
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -855,15 +865,23 @@ int __kprobes nmi_watchdog_tick(struct p
dummy &= ~P4_CCCR_OVF;
wrmsrl(wd->cccr_msr, dummy);
apic_write(APIC_LVTPC, APIC_DM_NMI);
+   /* start the cycle over again */
+   wrmsrl(wd->perfctr_msr,
+  -((u64)cpu_khz * 1000 / nmi_hz));
} else if (wd->perfctr_msr == 
MSR_ARCH_PERFMON_PERFCTR0) {
/*
 * ArchPerfom/Core Duo needs to re-unmask
 * the apic vector
 */
apic_write(APIC_LVTPC, APIC_DM_NMI);
+   /* ARCH_PERFMON has 32 bit counter writes */
+   wrmsr(wd->perfctr_msr,
+(u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
+   } else {
+   /* start the cycle over again */
+   wrmsrl(wd->perfctr_msr,
+  -((u64)cpu_khz * 1000 / nmi_hz));
}
-   /* start the cycle over again */
-   wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / 
nmi_hz));
rc = 1;
} else  if (nmi_watchdog == NMI_IO_APIC) {
/* don't know how to accurately check for this.


[PATCH x86 for review II] [9/39] x86_64: Use constant instead of raw number in x86_64 ioperm.c

2007-02-11 Thread Andi Kleen

From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

This is a tiny cleanup to increase readability

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/x86_64/kernel/ioport.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/arch/x86_64/kernel/ioport.c
===
--- linux.orig/arch/x86_64/kernel/ioport.c
+++ linux/arch/x86_64/kernel/ioport.c
@@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int le
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
-   regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
+   regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
return 0;
 }


[PATCH x86 for review II] [8/39] x86_64: Remove fastcall references in x86_64 code

2007-02-11 Thread Andi Kleen

From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

Unlike x86, x86_64 already passes arguments in registers.  The use of
regparm attribute makes no difference in produced code, and the use of
fastcall just bloats the code.
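
For background (my summary, not from the patch): on i386 the fastcall
annotation boils down to a regparm attribute so the first arguments travel
in registers, while the x86-64 calling convention already passes the first
six integer arguments in registers, so the attribute buys nothing there.
Roughly:

	/* i386: ask gcc to pass up to three args in EAX/EDX/ECX */
	#define fastcall __attribute__((regparm(3)))

	int fastcall three_args(int a, int b, int c);

	/* x86-64: the ABI already uses RDI/RSI/RDX/RCX/R8/R9 for the
	 * first six integer args, so regparm(3) changes nothing in the
	 * generated code. */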

Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/x86_64/kernel/acpi/sleep.c  |2 +-
 arch/x86_64/kernel/x8664_ksyms.c |4 ++--
 include/asm-x86_64/hw_irq.h  |2 +-
 include/asm-x86_64/mutex.h   |6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

Index: linux/arch/x86_64/kernel/acpi/sleep.c
===
--- linux.orig/arch/x86_64/kernel/acpi/sleep.c
+++ linux/arch/x86_64/kernel/acpi/sleep.c
@@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0;
 unsigned long acpi_video_flags;
 extern char wakeup_start, wakeup_end;
 
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 
 static pgd_t low_ptr;
 
Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -36,8 +36,8 @@ EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
 #ifdef CONFIG_SMP
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+extern void  __write_lock_failed(rwlock_t *rw);
+extern void  __read_lock_failed(rwlock_t *rw);
 EXPORT_SYMBOL(__write_lock_failed);
 EXPORT_SYMBOL(__read_lock_failed);
 #endif
Index: linux/include/asm-x86_64/hw_irq.h
===
--- linux.orig/include/asm-x86_64/hw_irq.h
+++ linux/include/asm-x86_64/hw_irq.h
@@ -91,7 +91,7 @@ extern void enable_8259A_irq(unsigned in
 extern int i8259A_irq_pending(unsigned int irq);
 extern void make_8259A_irq(unsigned int irq);
 extern void init_8259A(int aeoi);
-extern void FASTCALL(send_IPI_self(int vector));
+extern void send_IPI_self(int vector);
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
Index: linux/include/asm-x86_64/mutex.h
===
--- linux.orig/include/asm-x86_64/mutex.h
+++ linux/include/asm-x86_64/mutex.h
@@ -21,7 +21,7 @@ do {  
\
unsigned long dummy;\
\
typecheck(atomic_t *, v);   \
-   typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);   \
+   typecheck_fn(void (*)(atomic_t *), fail_fn);\
\
__asm__ __volatile__(   \
LOCK_PREFIX "   decl (%%rdi)\n" \
@@ -47,7 +47,7 @@ do {  
\
  */
 static inline int
 __mutex_fastpath_lock_retval(atomic_t *count,
-int fastcall (*fail_fn)(atomic_t *))
+int (*fail_fn)(atomic_t *))
 {
if (unlikely(atomic_dec_return(count) < 0))
return fail_fn(count);
@@ -67,7 +67,7 @@ do {  
\
unsigned long dummy;\
\
typecheck(atomic_t *, v);   \
-   typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);   \
+   typecheck_fn(void (*)(atomic_t *), fail_fn);\
\
__asm__ __volatile__(   \
LOCK_PREFIX "   incl (%%rdi)\n" \


[PATCH x86 for review II] [3/39] i386: arch/i386/kernel/cpu/mcheck/mce.c should #include <asm/mce.h>

2007-02-11 Thread Andi Kleen

From: Adrian Bunk <[EMAIL PROTECTED]>

Every file should include the headers containing the prototypes for
its global functions.

Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/cpu/mcheck/mce.c |1 +
 1 file changed, 1 insertion(+)

Index: linux/arch/i386/kernel/cpu/mcheck/mce.c
===
--- linux.orig/arch/i386/kernel/cpu/mcheck/mce.c
+++ linux/arch/i386/kernel/cpu/mcheck/mce.c
@@ -12,6 +12,7 @@
 
 #include  
 #include 
+#include <asm/mce.h>
 
 #include "mce.h"
 


[PATCH x86 for review II] [7/39] x86_64: Fix fake numa for x86_64 machines with big IO hole

2007-02-11 Thread Andi Kleen

From: Rohit Seth <[EMAIL PROTECTED]>

This patch resolves the issue of running with numa=fake=X on kernel command
line on x86_64 machines that have a big IO hole.  While calculating the size
of each node now we look at the total hole size in that range.

Previously there were nodes that only had IO holes in them causing kernel
boot problems.  We now use the NODE_MIN_SIZE (64MB) as the minimum size of
memory that any node must have.  We reduce the number of allocated nodes if
the number of nodes specified on kernel command line results in any node
getting memory smaller than NODE_MIN_SIZE.

This change hands out the extra memory in NODE_MIN_SIZE granules and
distributes it uniformly among as many nodes (called big nodes) as
possible.
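
As a rough worked example of that sizing (made-up numbers): with 4096 MB of
addressable memory, a 512 MB IO hole and numa=fake=6,

	usable   = 4096 MB - 512 MB          = 3584 MB
	sz       = 3584 MB / 6  ~= 597 MB    -> rounded down to 576 MB (64 MB granule)
	leftover = 3584 MB - 6 * 576 MB      = 128 MB
	big      = 128 MB / 64 MB            = 2 nodes get one extra 64 MB granule

so two "big" nodes end up with 640 MB and the other four with 576 MB.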

[EMAIL PROTECTED]: build fix]
Signed-off-by: David Rientjes <[EMAIL PROTECTED]>
Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
Signed-off-by: Rohit Seth <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/x86_64/kernel/e820.c   |   31 
 arch/x86_64/mm/numa.c   |  110 ++--
 include/asm-x86_64/e820.h   |1 
 include/asm-x86_64/mmzone.h |5 ++
 4 files changed, 133 insertions(+), 14 deletions(-)

Index: linux/arch/x86_64/kernel/e820.c
===
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -191,6 +191,37 @@ unsigned long __init e820_end_of_ram(voi
 }
 
 /*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+   unsigned long ram = 0;
+   int i;
+
+   for (i = 0; i < e820.nr_map; i++) {
+   struct e820entry *ei = &e820.map[i];
+   unsigned long last, addr;
+
+   if (ei->type != E820_RAM ||
+   ei->addr+ei->size <= start ||
+   ei->addr >= end)
+   continue;
+
+   addr = round_up(ei->addr, PAGE_SIZE);
+   if (addr < start)
+   addr = start;
+
+   last = round_down(ei->addr + ei->size, PAGE_SIZE);
+   if (last >= end)
+   last = end;
+
+   if (last > addr)
+   ram += last - addr;
+   }
+   return ((end - start) - ram);
+}
+
+/*
  * Mark e820 reserved areas as busy for the resource manager.
  */
 void __init e820_reserve_resources(void)
Index: linux/arch/x86_64/mm/numa.c
===
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -272,31 +272,113 @@ void __init numa_init_array(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
 int numa_fake __initdata = 0;
 
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+   if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+   (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+   return 1;
+   return 0;
+}
+
 static int __init numa_emulation(unsigned long start_pfn, unsigned long 
end_pfn)
 {
-   int i;
+   int i, big;
struct bootnode nodes[MAX_NUMNODES];
-   unsigned long sz = ((end_pfn - start_pfn)< 1) {
-   unsigned long x = 1;
-   while ((x << 1) < sz)
-   x <<= 1;
-   if (x < sz/2)
-   printk(KERN_ERR "Numa emulation unbalanced. Complain to 
maintainer\n");
-   sz = x;
-   }
 
+   old_sz = sz;
+   /*
+* Round down to the nearest FAKE_NODE_MIN_SIZE.
+*/
+   sz &= FAKE_NODE_MIN_HASH_MASK;
+
+   /*
+* We ensure that each node is at least 64MB big.  Smaller than this
+* size can cause VM hiccups.
+*/
+   if (sz == 0) {
+   printk(KERN_INFO "Not enough memory for %d nodes.  Reducing "
+   "the number of nodes\n", numa_fake);
+   numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+   printk(KERN_INFO "Number of fake nodes will be = %d\n",
+   numa_fake);
+   sz = FAKE_NODE_MIN_SIZE;
+   }
+   /*
+* Find out how many nodes can get an extra NODE_MIN_SIZE granule.
+* This logic ensures the extra memory gets distributed among as many
+* nodes as possible (as compared to one single node getting all that
+* extra memory.
+*/
+   big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+   printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+   "%d\n",
+   (sz >> 20), (hole_size >> 20), big);
memset(&nodes,0,sizeof(nodes));
+   end = start;
for 

[PATCH x86 for review II] [6/39] i386: romsignature/checksum cleanup

2007-02-11 Thread Andi Kleen

From: Rene Herman <[EMAIL PROTECTED]>

Use adding __init to romsignature() (it's only called from probe_roms()
which is itself __init) as an excuse to submit a pedantic cleanup.

Signed-off-by: Rene Herman <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/i386/kernel/e820.c |   17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

Index: linux/arch/i386/kernel/e820.c
===
--- linux.orig/arch/i386/kernel/e820.c
+++ linux/arch/i386/kernel/e820.c
@@ -157,21 +157,22 @@ static struct resource standard_io_resou
.flags  = IORESOURCE_BUSY | IORESOURCE_IO
 } };
 
-static int romsignature(const unsigned char *x)
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
 {
unsigned short sig;
-   int ret = 0;
-   if (probe_kernel_address((const unsigned short *)x, sig) == 0)
-   ret = (sig == 0xaa55);
-   return ret;
+
+   return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
+  sig == ROMSIGNATURE;
 }
 
 static int __init romchecksum(unsigned char *rom, unsigned long length)
 {
-   unsigned char *p, sum = 0;
+   unsigned char sum;
 
-   for (p = rom; p < rom + length; p++)
-   sum += *p;
+   for (sum = 0; length; length--)
+   sum += *rom++;
return sum == 0;
 }
 


[PATCH x86 for review II] [26/39] i386: fix 32-bit ioctls on x64_32

2007-02-11 Thread Andi Kleen

From: Giuliano Procida <[EMAIL PROTECTED]>
[MTRR] fix 32-bit ioctls on x64_32

Signed-off-by: Giuliano Procida <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

Fixed incomplete support for 32-bit compatibility ioctls in
2.6.19.1. They were unhandled in one of three case-statements.
Tested using the X server before and after the change.
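
For context (a sketch of the compat side, not part of this patch): the
32-bit ioctl numbers differ from the native MTRRIOC_* values because _IOW()
encodes the size of the argument struct, and the compat struct layout is
smaller, along these lines:

	#ifdef CONFIG_COMPAT
	struct mtrr_sentry32 {
		compat_ulong_t base;	/* base address */
		compat_uint_t  size;	/* size of region */
		compat_uint_t  type;	/* type of region */
	};

	#define MTRRIOC32_ADD_ENTRY \
		_IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
	#endif

so the compat numbers need their own case labels in mtrr_ioctl(), which is
what the hunks below add.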

---
 arch/i386/kernel/cpu/mtrr/if.c |   30 ++
 1 file changed, 30 insertions(+)

Index: linux/arch/i386/kernel/cpu/mtrr/if.c
===
--- linux.orig/arch/i386/kernel/cpu/mtrr/if.c
+++ linux/arch/i386/kernel/cpu/mtrr/if.c
@@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned i
default:
return -ENOTTY;
case MTRRIOC_ADD_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_ADD_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
@@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned i
  file, 0);
break;
case MTRRIOC_SET_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_SET_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
break;
case MTRRIOC_DEL_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_DEL_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_KILL_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_GET_ENTRY:
+#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned i
 
break;
case MTRRIOC_ADD_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_ADD_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
@@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned i
  file, 1);
break;
case MTRRIOC_SET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_SET_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
break;
case MTRRIOC_DEL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_DEL_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_KILL_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+   case MTRRIOC32_GET_PAGE_ENTRY:
+#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);


[PATCH x86 for review II] [33/39] x86_64: Fix off by one error in IOMMU boundary checking

2007-02-11 Thread Andi Kleen

Should be harmless because there is normally no memory there, but
technically it was incorrect.

Pointed out by Leo Duran
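
An illustrative boundary case (numbers made up): with a 32-bit DMA mask,

	mask        = 0x00000000ffffffff
	addr + size = 0x00000000ffffffff	(last byte 0xfffffffe, within the mask)

the old "addr + size >= mask" test needlessly forced such a mapping through
the IOMMU, while "addr + size > mask" leaves it alone.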

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/pci-gart.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/pci-gart.c
===
--- linux.orig/arch/x86_64/kernel/pci-gart.c
+++ linux/arch/x86_64/kernel/pci-gart.c
@@ -185,7 +185,7 @@ static void iommu_full(struct device *de
 static inline int need_iommu(struct device *dev, unsigned long addr, size_t 
size)
 { 
u64 mask = *dev->dma_mask;
-   int high = addr + size >= mask;
+   int high = addr + size > mask;
int mmu = high;
if (force_iommu) 
mmu = 1; 
@@ -195,7 +195,7 @@ static inline int need_iommu(struct devi
 static inline int nonforced_iommu(struct device *dev, unsigned long addr, 
size_t size)
 { 
u64 mask = *dev->dma_mask;
-   int high = addr + size >= mask;
+   int high = addr + size > mask;
int mmu = high;
return mmu; 
 }


[PATCH x86 for review II] [14/39] x86_64: cleanup Doc/x86_64/ files

2007-02-11 Thread Andi Kleen

From: Randy Dunlap <[EMAIL PROTECTED]>

Fix typos.
Lots of whitespace changes for readability and consistency.

Signed-off-by: Randy Dunlap <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 Documentation/x86_64/boot-options.txt |   27 ++-
 Documentation/x86_64/cpu-hotplug-spec |2 +-
 Documentation/x86_64/kernel-stacks|   26 +-
 Documentation/x86_64/mm.txt   |   22 +++---
 4 files changed, 35 insertions(+), 42 deletions(-)

Index: linux/Documentation/x86_64/cpu-hotplug-spec
===
--- linux.orig/Documentation/x86_64/cpu-hotplug-spec
+++ linux/Documentation/x86_64/cpu-hotplug-spec
@@ -2,7 +2,7 @@ Firmware support for CPU hotplug under L
 ---
 
 Linux/x86-64 supports CPU hotplug now. For various reasons Linux wants to
-know in advance boot time the maximum number of CPUs that could be plugged
+know in advance of boot time the maximum number of CPUs that could be plugged
 into the system. ACPI 3.0 currently has no official way to supply
 this information from the firmware to the operating system.
 
Index: linux/Documentation/x86_64/kernel-stacks
===
--- linux.orig/Documentation/x86_64/kernel-stacks
+++ linux/Documentation/x86_64/kernel-stacks
@@ -9,9 +9,9 @@ zombie. While the thread is in user spac
 except for the thread_info structure at the bottom.
 
 In addition to the per thread stacks, there are specialized stacks
-associated with each cpu.  These stacks are only used while the kernel
-is in control on that cpu, when a cpu returns to user space the
-specialized stacks contain no useful data.  The main cpu stacks is
+associated with each CPU.  These stacks are only used while the kernel
+is in control on that CPU; when a CPU returns to user space the
+specialized stacks contain no useful data.  The main CPU stacks are:
 
 * Interrupt stack.  IRQSTACKSIZE
 
@@ -32,17 +32,17 @@ x86_64 also has a feature which is not a
 to automatically switch to a new stack for designated events such as
 double fault or NMI, which makes it easier to handle these unusual
 events on x86_64.  This feature is called the Interrupt Stack Table
-(IST).  There can be up to 7 IST entries per cpu. The IST code is an
-index into the Task State Segment (TSS), the IST entries in the TSS
-point to dedicated stacks, each stack can be a different size.
+(IST).  There can be up to 7 IST entries per CPU. The IST code is an
+index into the Task State Segment (TSS). The IST entries in the TSS
+point to dedicated stacks; each stack can be a different size.
 
-An IST is selected by an non-zero value in the IST field of an
+An IST is selected by a non-zero value in the IST field of an
 interrupt-gate descriptor.  When an interrupt occurs and the hardware
 loads such a descriptor, the hardware automatically sets the new stack
 pointer based on the IST value, then invokes the interrupt handler.  If
 software wants to allow nested IST interrupts then the handler must
 adjust the IST values on entry to and exit from the interrupt handler.
-(this is occasionally done, e.g. for debug exceptions)
+(This is occasionally done, e.g. for debug exceptions.)
 
 Events with different IST codes (i.e. with different stacks) can be
 nested.  For example, a debug interrupt can safely be interrupted by an
@@ -58,17 +58,17 @@ The currently assigned IST stacks are :-
 
   Used for interrupt 12 - Stack Fault Exception (#SS).
 
-  This allows to recover from invalid stack segments. Rarely
+  This allows the CPU to recover from invalid stack segments. Rarely
   happens.
 
 * DOUBLEFAULT_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
 
   Used for interrupt 8 - Double Fault Exception (#DF).
 
-  Invoked when handling a exception causes another exception. Happens
-  when the kernel is very confused (e.g. kernel stack pointer corrupt)
-  Using a separate stack allows to recover from it well enough in many
-  cases to still output an oops.
+  Invoked when handling one exception causes another exception. Happens
+  when the kernel is very confused (e.g. kernel stack pointer corrupt).
+  Using a separate stack allows the kernel to recover from it well enough
+  in many cases to still output an oops.
 
 * NMI_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
 
Index: linux/Documentation/x86_64/mm.txt
===
--- linux.orig/Documentation/x86_64/mm.txt
+++ linux/Documentation/x86_64/mm.txt
@@ -3,26 +3,26 @@
 
 Virtual memory map with 4 level page tables:
 
- - 7fff (=47bits) user space, different per mm
+ - 7fff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
-8000 - 80ff (=40bits) guard hole
-8100 - c0ff (=46bits) direct mapping of all 

[PATCH x86 for review II] [32/39] x86_64: x86_64 - Fix FS/GS registers for VT execution

2007-02-11 Thread Andi Kleen

From: Zachary Amsden <[EMAIL PROTECTED]>

Initialize FS and GS to __KERNEL_DS as well.  The actual value of them is not
important, but it is important to reload them in protected mode.  At this time,
they still retain the real mode values from initial boot.  VT disallows
execution of code under such conditions, which means hardware virtualization
can not be used to boot the kernel on Intel platforms, making the boot time
painfully slow.

This requires moving the GS load before the load of GS_BASE, so just move
all the segments loads there to keep them together in the code.

Signed-off-by: Zachary Amsden <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/head.S |   20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

Index: linux/arch/x86_64/kernel/head.S
===
--- linux.orig/arch/x86_64/kernel/head.S
+++ linux/arch/x86_64/kernel/head.S
@@ -163,6 +163,20 @@ startup_64:
 */
lgdtcpu_gdt_descr
 
+   /* set up data segments. actually 0 would do too */
+   movl $__KERNEL_DS,%eax
+   movl %eax,%ds
+   movl %eax,%ss
+   movl %eax,%es
+
+   /*
+* We don't really need to load %fs or %gs, but load them anyway
+* to kill any stale realmode selectors.  This allows execution
+* under VT hardware.
+*/
+   movl %eax,%fs
+   movl %eax,%gs
+
/* 
 * Setup up a dummy PDA. this is just for some early bootup code
 * that does in_interrupt() 
@@ -173,12 +187,6 @@ startup_64:
shrq$32,%rdx
wrmsr   
 
-   /* set up data segments. actually 0 would do too */
-   movl $__KERNEL_DS,%eax
-   movl %eax,%ds   
-   movl %eax,%ss
-   movl %eax,%es
-   
/* esi is pointer to real mode structure with interesting info.
   pass it to C */
movl%esi, %edi


[PATCH x86 for review II] [31/39] x86_64: Unexport __supported_pte_mask

2007-02-11 Thread Andi Kleen

The symbol is needed to manipulate page tables, and modules shouldn't
do that.

Leftover from 2.4, but no in-tree module should need it now.

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/setup64.c |1 -
 1 file changed, 1 deletion(-)

Index: linux/arch/x86_64/kernel/setup64.c
===
--- linux.orig/arch/x86_64/kernel/setup64.c
+++ linux/arch/x86_64/kernel/setup64.c
@@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 -
 char boot_cpu_stack[IRQSTACKSIZE] 
__attribute__((section(".bss.page_aligned")));
 
 unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL(__supported_pte_mask);
 static int do_not_nx __cpuinitdata = 0;
 
 /* noexec=on|off


[PATCH x86 for review II] [25/39] x86_64: Fix preprocessor condition

2007-02-11 Thread Andi Kleen

From: "Josef 'Jeff' Sipek" <[EMAIL PROTECTED]>
Signed-off-by: Josef 'Jeff' Sipek <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 include/asm-x86_64/io.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux/include/asm-x86_64/io.h
===
--- linux.orig/include/asm-x86_64/io.h
+++ linux/include/asm-x86_64/io.h
@@ -100,7 +100,7 @@ __OUTS(l)
 
 #define IO_SPACE_LIMIT 0x
 
-#if defined(__KERNEL__) && __x86_64__
+#if defined(__KERNEL__) && defined(__x86_64__)
 
 #include 
 


[PATCH x86 for review II] [27/39] i386: APM on i386

2007-02-11 Thread Andi Kleen

From: Alexey Dobriyan <[EMAIL PROTECTED]>
Byte-to-byte identical /proc/apm here.

Signed-off-by: Alexey Dobriyan <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/apm.c |   26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

Index: linux/arch/i386/kernel/apm.c
===
--- linux.orig/arch/i386/kernel/apm.c
+++ linux/arch/i386/kernel/apm.c
@@ -211,6 +211,7 @@
 #include 
 #include 
 #include 
+#include <linux/seq_file.h>
 #include 
 #include 
 #include 
@@ -1636,9 +1637,8 @@ static int do_open(struct inode * inode,
return 0;
 }
 
-static int apm_get_info(char *buf, char **start, off_t fpos, int length)
+static int proc_apm_show(struct seq_file *m, void *v)
 {
-   char *  p;
unsigned short  bx;
unsigned short  cx;
unsigned short  dx;
@@ -1650,8 +1650,6 @@ static int apm_get_info(char *buf, char 
int time_units = -1;
char*units = "?";
 
-   p = buf;
-
if ((num_online_cpus() == 1) &&
!(error = apm_get_power_status(&bx, &cx, &dx))) {
ac_line_status = (bx >> 8) & 0xff;
@@ -1705,7 +1703,7 @@ static int apm_get_info(char *buf, char 
  -1: Unknown
   8) min = minutes; sec = seconds */
 
-   p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+   seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
 driver_version,
 (apm_info.bios.version >> 8) & 0xff,
 apm_info.bios.version & 0xff,
@@ -1716,10 +1714,22 @@ static int apm_get_info(char *buf, char 
 percentage,
 time_units,
 units);
+   return 0;
+}
 
-   return p - buf;
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, proc_apm_show, NULL);
 }
 
+static const struct file_operations apm_file_ops = {
+   .owner  = THIS_MODULE,
+   .open   = proc_apm_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+};
+
 static int apm(void *unused)
 {
unsigned short  bx;
@@ -2341,9 +2351,9 @@ static int __init apm_init(void)
set_base(gdt[APM_DS >> 3],
 __va((unsigned long)apm_info.bios.dseg << 4));
 
-   apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
+   apm_proc = create_proc_entry("apm", 0, NULL);
if (apm_proc)
-   apm_proc->owner = THIS_MODULE;
+   apm_proc->proc_fops = &apm_file_ops;
 
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {


[PATCH x86 for review II] [24/39] i386: use smp_call_function_single()

2007-02-11 Thread Andi Kleen

From: Alexey Dobriyan <[EMAIL PROTECTED]>
It will execute cpuid only on the CPU we need.

Signed-off-by: Alexey Dobriyan <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/cpuid.c |7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

Index: linux/arch/i386/kernel/cpuid.c
===
--- linux.orig/arch/i386/kernel/cpuid.c
+++ linux/arch/i386/kernel/cpuid.c
@@ -48,7 +48,6 @@ static struct class *cpuid_class;
 #ifdef CONFIG_SMP
 
 struct cpuid_command {
-   int cpu;
u32 reg;
u32 *data;
 };
@@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_bl
 {
struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
 
-   if (cmd->cpu == smp_processor_id())
-   cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
+   cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
  &cmd->data[3]);
 }
 
@@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32
if (cpu == smp_processor_id()) {
cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
} else {
-   cmd.cpu = cpu;
cmd.reg = reg;
cmd.data = data;
 
-   smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1);
+   smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
}
preempt_enable();
 }


[PATCH x86 for review II] [20/39] i386: Small cleanup to TLB flush code

2007-02-11 Thread Andi Kleen

- Remove outdated comment
- Use cpu_relax() in a busy loop

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/i386/kernel/smp.c |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: linux/arch/i386/kernel/smp.c
===
--- linux.orig/arch/i386/kernel/smp.c
+++ linux/arch/i386/kernel/smp.c
@@ -375,8 +375,7 @@ static void flush_tlb_others(cpumask_t c
/*
 * i'm not happy about this global shared spinlock in the
 * MM hot path, but we'll see how contended it is.
-* Temporarily this turns IRQs off, so that lockups are
-* detected by the NMI watchdog.
+* AK: x86-64 has a faster method that could be ported.
 */
spin_lock(&tlbstate_lock);

@@ -401,7 +400,7 @@ static void flush_tlb_others(cpumask_t c
 
while (!cpus_empty(flush_cpumask))
/* nothing. lockup detection does not belong here */
-   mb();
+   cpu_relax();
 
flush_mm = NULL;
flush_va = 0;


[PATCH x86 for review II] [22/39] x86_64: Kconfig typos

2007-02-11 Thread Andi Kleen

From: Nicolas Kaiser <[EMAIL PROTECTED]>
Some typos in Kconfig.

Signed-off-by: Nicolas Kaiser <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

---
 arch/x86_64/Kconfig |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

Index: linux/arch/x86_64/Kconfig
===
--- linux.orig/arch/x86_64/Kconfig
+++ linux/arch/x86_64/Kconfig
@@ -148,18 +148,18 @@ config MPSC
  Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs
  with Intel Extended Memory 64 Technology(EM64T). For details see
  .
- Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the
-  Netburst core and shouldn't use this option. You can distingush them
+ Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
+  Netburst core and shouldn't use this option. You can distinguish them
  using the cpu family field
- in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one
- (this rule only applies to system that support EM64T)
+ in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one
+ (this rule only applies to systems that support EM64T)
 
 config MCORE2
bool "Intel Core2 / newer Xeon"
help
  Optimize for Intel Core2 and newer Xeons (51xx)
- You can distingush the newer Xeons from the older ones using
- the cpu family field in /proc/cpuinfo. 15 is a older Xeon
+ You can distinguish the newer Xeons from the older ones using
+ the cpu family field in /proc/cpuinfo. 15 is an older Xeon
  (use CONFIG_MPSC then), 6 is a newer one. This rule only
  applies to CPUs that support EM64T.
 


[PATCH x86 for review II] [19/39] x86_64: remove get_pmd()

2007-02-11 Thread Andi Kleen

From: "Jan Beulich" <[EMAIL PROTECTED]>
Function is dead.

Signed-off-by: Jan Beulich <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 include/asm-x86_64/pgalloc.h |5 -
 1 file changed, 5 deletions(-)

Index: linux/include/asm-x86_64/pgalloc.h
===
--- linux.orig/include/asm-x86_64/pgalloc.h
+++ linux/include/asm-x86_64/pgalloc.h
@@ -18,11 +18,6 @@ static inline void pmd_populate(struct m
set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
 }
 
-static inline pmd_t *get_pmd(void)
-{
-   return (pmd_t *)get_zeroed_page(GFP_KERNEL);
-}
-
 static inline void pmd_free(pmd_t *pmd)
 {
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));


[PATCH x86 for review II] [36/39] x86_64: define dma noncoherent API functions

2007-02-11 Thread Andi Kleen

From: Jeff Garzik <[EMAIL PROTECTED]>

x86-64 is missing these:

Signed-off-by: Jeff Garzik <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 include/asm-x86_64/dma-mapping.h |3 +++
 1 file changed, 3 insertions(+)

Index: linux/include/asm-x86_64/dma-mapping.h
===
--- linux.orig/include/asm-x86_64/dma-mapping.h
+++ linux/include/asm-x86_64/dma-mapping.h
@@ -66,6 +66,9 @@ static inline int dma_mapping_error(dma_
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+
 extern void *dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp);
 extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,


[PATCH x86 for review II] [18/39] x86_64: Allow to run a program when a machine check event is detected

2007-02-11 Thread Andi Kleen

When a machine check event is detected (including an AMD RevF threshold
overflow event), allow a "trigger" program to be run. This allows user space
to react to such events sooner.

The trigger is configured using a new trigger entry in the 
machinecheck sysfs interface. It is currently shared between
all CPUs.
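
A minimal sketch of setting it from user space (the sysfs path is what the
"machinecheck" sysdev class should produce; the trigger program name is
made up):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/devices/system/machinecheck"
				"/machinecheck0/trigger", "w");

		if (!f)
			return 1;
		fputs("/usr/local/sbin/mce-alert\n", f);	/* hypothetical */
		return fclose(f) ? 1 : 0;
	}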

I also fixed the AMD threshold handler to run the machine 
check polling code immediately to actually log any events
that might have caused the threshold interrupt.

Also added some documentation for the mce sysfs interface.

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 Documentation/x86_64/machinecheck |   70 ++
 arch/x86_64/kernel/mce.c  |   66 +--
 arch/x86_64/kernel/mce_amd.c  |4 ++
 include/asm-x86_64/mce.h  |2 +
 kernel/kmod.c |   44 ---
 5 files changed, 160 insertions(+), 26 deletions(-)

Index: linux/arch/x86_64/kernel/mce.c
===
--- linux.orig/arch/x86_64/kernel/mce.c
+++ linux/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <linux/kmod.h>
 #include  
 #include 
 #include 
@@ -42,6 +43,10 @@ static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
 static int mce_bootlog = 1;
+static atomic_t mce_events;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
 
 /*
  * Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = { 
 void mce_log(struct mce *mce)
 {
unsigned next, entry;
+   atomic_inc(&mce_events);
mce->finished = 0;
wmb();
for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mc
}
 }
 
+static void do_mce_trigger(void)
+{
+   static atomic_t mce_logged;
+   int events = atomic_read(&mce_events);
+   if (events != atomic_read(&mce_logged) && trigger[0]) {
+   /* Small race window, but should be harmless.  */
+   atomic_set(&mce_logged, events);
+   call_usermodehelper(trigger, trigger_argv, NULL, -1);
+   }
+}
+
 /* 
  * The actual machine check handler
  */
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * r
}
 
/* Never do anything final in the polling timer */
-   if (!regs)
+   if (!regs) {
+   /* Normal interrupt context here. Call trigger for any new
+  events. */
+   do_mce_trigger();
goto out;
+   }
 
/* If we didn't find an uncorrectable error, pick
   the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device
}   
   \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
+/* TBD should generate these dynamically based on number of available banks */
 ACCESSOR(bank0ctl,bank[0],mce_restart())
 ACCESSOR(bank1ctl,bank[1],mce_restart())
 ACCESSOR(bank2ctl,bank[2],mce_restart())
 ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
-   &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-   &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
+
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+   strcpy(buf, trigger);
+   strcat(buf, "\n");
+   return strlen(trigger) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+   char *p;
+   int len;
+   strncpy(trigger, buf, sizeof(trigger));
+   trigger[sizeof(trigger)-1] = 0;
+   len = strlen(trigger);
+   p = strchr(trigger, '\n');
+   if (*p) *p = 0;
+   return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 ACCESSOR(tolerant,tolerant,)
 ACCESSOR(check_interval,check_interval,mce_restart())
+static struct sysdev_attribute *mce_attributes[] = {
+   _bank0ctl, _bank1ctl, _bank2ctl,
+   _bank3ctl, _bank4ctl, _bank5ctl,
+   _tolerant, _check_interval, _trigger,
+   NULL
+};
 
 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(u
err = sysdev_register(_cpu(device_mce,cpu));
 
if (!err) {
-   for (i = 0; i < banks; i++)
+   for (i = 0; mce_attributes[i]; i++)
sysdev_create_file(_cpu(device_mce,cpu),
-   bank_attributes[i]);
-   sysdev_create_file(_cpu(device_mce,cpu), _tolerant);
-   sysdev_create_file(_cpu(device_mce,cpu), 
_check_interval);
+   mce_attributes[i]);
}
return err;
 }
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned i
 {
   

[PATCH x86 for review II] [35/39] x86_64: Don't reserve ROMs

2007-02-11 Thread Andi Kleen

We trust the e820 table, so explicitly reserving ROMs shouldn't 
be needed.

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/setup.c |  130 -
 1 file changed, 2 insertions(+), 128 deletions(-)

Index: linux/arch/x86_64/kernel/setup.c
===
--- linux.orig/arch/x86_64/kernel/setup.c
+++ linux/arch/x86_64/kernel/setup.c
@@ -138,128 +138,6 @@ struct resource code_resource = {
.flags = IORESOURCE_RAM,
 };
 
-#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
-
-static struct resource system_rom_resource = {
-   .name = "System ROM",
-   .start = 0xf,
-   .end = 0xf,
-   .flags = IORESOURCE_ROM,
-};
-
-static struct resource extension_rom_resource = {
-   .name = "Extension ROM",
-   .start = 0xe,
-   .end = 0xe,
-   .flags = IORESOURCE_ROM,
-};
-
-static struct resource adapter_rom_resources[] = {
-   { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
-   .flags = IORESOURCE_ROM },
-   { .name = "Adapter ROM", .start = 0, .end = 0,
-   .flags = IORESOURCE_ROM },
-   { .name = "Adapter ROM", .start = 0, .end = 0,
-   .flags = IORESOURCE_ROM },
-   { .name = "Adapter ROM", .start = 0, .end = 0,
-   .flags = IORESOURCE_ROM },
-   { .name = "Adapter ROM", .start = 0, .end = 0,
-   .flags = IORESOURCE_ROM },
-   { .name = "Adapter ROM", .start = 0, .end = 0,
-   .flags = IORESOURCE_ROM }
-};
-
-static struct resource video_rom_resource = {
-   .name = "Video ROM",
-   .start = 0xc,
-   .end = 0xc7fff,
-   .flags = IORESOURCE_ROM,
-};
-
-static struct resource video_ram_resource = {
-   .name = "Video RAM area",
-   .start = 0xa,
-   .end = 0xb,
-   .flags = IORESOURCE_RAM,
-};
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static int __init romchecksum(unsigned char *rom, unsigned long length)
-{
-   unsigned char *p, sum = 0;
-
-   for (p = rom; p < rom + length; p++)
-   sum += *p;
-   return sum == 0;
-}
-
-static void __init probe_roms(void)
-{
-   unsigned long start, length, upper;
-   unsigned char *rom;
-   int   i;
-
-   /* video rom */
-   upper = adapter_rom_resources[0].start;
-   for (start = video_rom_resource.start; start < upper; start += 2048) {
-   rom = isa_bus_to_virt(start);
-   if (!romsignature(rom))
-   continue;
-
-   video_rom_resource.start = start;
-
-   /* 0 < length <= 0x7f * 512, historically */
-   length = rom[2] * 512;
-
-   /* if checksum okay, trust length byte */
-   if (length && romchecksum(rom, length))
-   video_rom_resource.end = start + length - 1;
-
-   request_resource(_resource, _rom_resource);
-   break;
-   }
-
-   start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-   if (start < upper)
-   start = upper;
-
-   /* system rom */
-   request_resource(_resource, _rom_resource);
-   upper = system_rom_resource.start;
-
-   /* check for extension rom (ignore length byte!) */
-   rom = isa_bus_to_virt(extension_rom_resource.start);
-   if (romsignature(rom)) {
-   length = extension_rom_resource.end - 
extension_rom_resource.start + 1;
-   if (romchecksum(rom, length)) {
-   request_resource(_resource, 
_rom_resource);
-   upper = extension_rom_resource.start;
-   }
-   }
-
-   /* check for adapter roms on 2k boundaries */
-   for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
-start += 2048) {
-   rom = isa_bus_to_virt(start);
-   if (!romsignature(rom))
-   continue;
-
-   /* 0 < length <= 0x7f * 512, historically */
-   length = rom[2] * 512;
-
-   /* but accept any length that fits if checksum okay */
-   if (!length || start + length > upper || !romchecksum(rom, 
length))
-   continue;
-
-   adapter_rom_resources[i].start = start;
-   adapter_rom_resources[i].end = start + length - 1;
-   request_resource(_resource, _rom_resources[i]);
-
-   start = adapter_rom_resources[i++].end & ~2047UL;
-   }
-}
-
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
@@ -524,15 +402,11 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
 
/*
-* Request address space for all standard RAM and ROM resources
-* and also for regions 

[PATCH x86 for review II] [39/39] i386: All Transmeta CPUs have constant TSCs

2007-02-11 Thread Andi Kleen

From: "H. Peter Anvin" <[EMAIL PROTECTED]>

All Transmeta CPUs ever produced have constant-rate TSCs.

Signed-off-by: H. Peter Anvin <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/i386/kernel/cpu/transmeta.c |3 +++
 1 file changed, 3 insertions(+)

Index: linux/arch/i386/kernel/cpu/transmeta.c
===
--- linux.orig/arch/i386/kernel/cpu/transmeta.c
+++ linux/arch/i386/kernel/cpu/transmeta.c
@@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(str
wrmsr(0x80860004, ~0, uk);
c->x86_capability[0] = cpuid_edx(0x0001);
wrmsr(0x80860004, cap_mask, uk);
+
+   /* All Transmeta CPUs have a constant TSC */
+   set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);

/* If we can run i686 user-space code, call us an i686 */
 #define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)


[PATCH x86 for review II] [34/39] i386: Use stack arguments for calling into EFI

2007-02-11 Thread Andi Kleen

When calling into the EFI firmware, the parameters need to be passed on
the stack. The recent change to use -mregparm=3 breaks x86 EFI support.
This patch is needed to allow the new Intel-based Macs to suspend to RAM
(efi.get_time is called during the suspend phase).
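
For context: on i386, asmlinkage expands to a regparm(0) attribute, which is
what forces these calls back to stack-based argument passing even when the
rest of the kernel is built with -mregparm=3. A minimal editorial sketch of
the idea (the type and function names are made up, not from the patch):

	/* Sketch: a prototype for code that follows the plain stack-argument
	 * ABI must say so explicitly once the kernel defaults to register
	 * passing.  This is what the added asmlinkage does on i386. */
	#define stack_abi __attribute__((regparm(0)))

	typedef stack_abi unsigned long fw_get_time_t(void *tm, void *tc);

	/* Every call through a fw_get_time_t * now pushes its arguments on
	 * the stack, matching what the EFI firmware expects. */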

Signed-off-by: Frederic Riss <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 include/linux/efi.h |   43 +++
 1 file changed, 27 insertions(+), 16 deletions(-)

Index: linux/include/linux/efi.h
===
--- linux.orig/include/linux/efi.h
+++ linux/include/linux/efi.h
@@ -157,22 +157,33 @@ typedef struct {
unsigned long reset_system;
 } efi_runtime_services_t;
 
-typedef efi_status_t efi_get_time_t (efi_time_t *tm, efi_time_cap_t *tc);
-typedef efi_status_t efi_set_time_t (efi_time_t *tm);
-typedef efi_status_t efi_get_wakeup_time_t (efi_bool_t *enabled, efi_bool_t 
*pending,
-   efi_time_t *tm);
-typedef efi_status_t efi_set_wakeup_time_t (efi_bool_t enabled, efi_time_t 
*tm);
-typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t 
*vendor, u32 *attr,
-unsigned long *data_size, void *data);
-typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, 
efi_char16_t *name,
- efi_guid_t *vendor);
-typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t 
*vendor, 
-unsigned long attr, unsigned long 
data_size, 
-void *data);
-typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count);
-typedef void efi_reset_system_t (int reset_type, efi_status_t status,
-unsigned long data_size, efi_char16_t *data);
-typedef efi_status_t efi_set_virtual_address_map_t (unsigned long 
memory_map_size,
+typedef asmlinkage efi_status_t efi_get_time_t (efi_time_t *tm,
+   efi_time_cap_t *tc);
+typedef asmlinkage efi_status_t efi_set_time_t (efi_time_t *tm);
+typedef asmlinkage efi_status_t efi_get_wakeup_time_t (efi_bool_t *enabled,
+  efi_bool_t *pending,
+  efi_time_t *tm);
+typedef asmlinkage efi_status_t efi_set_wakeup_time_t (efi_bool_t enabled,
+  efi_time_t *tm);
+typedef asmlinkage efi_status_t efi_get_variable_t (efi_char16_t *name,
+   efi_guid_t *vendor,
+   u32 *attr,
+   unsigned long *data_size,
+   void *data);
+typedef asmlinkage efi_status_t efi_get_next_variable_t (unsigned long 
*name_sz,
+efi_char16_t *name,
+efi_guid_t *vendor);
+typedef asmlinkage efi_status_t efi_set_variable_t (efi_char16_t *name,
+   efi_guid_t *vendor,
+   unsigned long attr,
+   unsigned long data_size,
+   void *data);
+typedef asmlinkage efi_status_t efi_get_next_high_mono_count_t (u32 *count);
+typedef asmlinkage void efi_reset_system_t (int reset_type,
+   efi_status_t status,
+   unsigned long data_size,
+   efi_char16_t *data);
+typedef asmlinkage efi_status_t efi_set_virtual_address_map_t (unsigned long 
memory_map_size,
unsigned long descriptor_size,
u32 descriptor_version,
efi_memory_desc_t *virtual_map);


[PATCH x86 for review II] [17/39] x86_64: Tighten mce_amd driver MSR reads

2007-02-11 Thread Andi Kleen

From: "Jan Beulich" <[EMAIL PROTECTED]>

While debugging an unrelated problem in Xen, I noticed odd reads from
non-existent MSRs. Having now found time to look into why these happen, I
came up with the patch below, which
- prevents accessing MCi_MISCj with j > 0 when the block pointer in
MCi_MISC0 is zero
- accesses only contiguous MCi_MISCj until a non-implemented one is
found
- doesn't touch unimplemented blocks in mce_threshold_interrupt at all
- gives names to two bits previously derived from MASK_VALID_HI (it
took me some time to understand the code without this)

The first three items, besides being apparently closer to the spec, should
also help cut down on the time mce_threshold_interrupt() takes.
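
For reference, the two newly named bits sit one and two positions below the
valid bit in the high half of the MSR; written out with the full 32-bit
constants as defined in mce_amd.c, purely as a readability aid:

	#define MASK_VALID_HI	0x80000000	/* bit 31 of the high word */
	#define MASK_CNTP_HI	0x40000000	/* counter present: MASK_VALID_HI >> 1 */
	#define MASK_LOCKED_HI	0x20000000	/* locked:          MASK_VALID_HI >> 2 */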

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/mce_amd.c |   40 +---
 1 file changed, 25 insertions(+), 15 deletions(-)

Index: linux/arch/x86_64/kernel/mce_amd.c
===
--- linux.orig/arch/x86_64/kernel/mce_amd.c
+++ linux/arch/x86_64/kernel/mce_amd.c
@@ -37,6 +37,8 @@
 #define THRESHOLD_MAX 0xFFF
 #define INT_TYPE_APIC 0x0002
 #define MASK_VALID_HI 0x8000
+#define MASK_CNTP_HI  0x4000
+#define MASK_LOCKED_HI0x2000
 #define MASK_LVTOFF_HI0x00F0
 #define MASK_COUNT_EN_HI  0x0008
 #define MASK_INT_TYPE_HI  0x0006
@@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(stru
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
-   else if (block == 1)
-   address = MCG_XBLK_ADDR
-   + ((low & MASK_BLKPTR_LO) >> 21);
+   else if (block == 1) {
+   address = (low & MASK_BLKPTR_LO) >> 21;
+   if (!address)
+   break;
+   address += MCG_XBLK_ADDR;
+   }
else
++address;
 
if (rdmsr_safe(address, , ))
-   continue;
+   break;
 
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(stru
break;
}
 
-   if (!(high & MASK_VALID_HI >> 1)  ||
-(high & MASK_VALID_HI >> 2))
+   if (!(high & MASK_CNTP_HI)  ||
+(high & MASK_LOCKED_HI))
continue;
 
if (!block)
@@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(
 
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
+   if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
+   continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
-   else if (block == 1)
-   address = MCG_XBLK_ADDR
-   + ((low & MASK_BLKPTR_LO) >> 21);
+   else if (block == 1) {
+   address = (low & MASK_BLKPTR_LO) >> 21;
+   if (!address)
+   break;
+   address += MCG_XBLK_ADDR;
+   }
else
++address;
 
if (rdmsr_safe(address, , ))
-   continue;
+   break;
 
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -206,8 +216,8 @@ asmlinkage void mce_threshold_interrupt(
break;
}
 
-   if (!(high & MASK_VALID_HI >> 1)  ||
-(high & MASK_VALID_HI >> 2))
+   if (!(high & MASK_CNTP_HI)  ||
+(high & MASK_LOCKED_HI))
continue;
 
if (high & MASK_OVERFLOW_HI) {
@@ -385,7 +395,7 @@ static __cpuinit int allocate_threshold_
return 0;
 
if (rdmsr_safe(address, , ))
-   goto recurse;
+   return 0;
 
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -394,8 +404,8 @@ static __cpuinit int allocate_threshold_
return 0;
}
 
-   if (!(high & MASK_VALID_HI >> 1)  ||
-(high & MASK_VALID_HI >> 2))
+   if (!(high 

[PATCH x86 for review II] [37/39] x86_64: robustify bad_dma_address handling

2007-02-11 Thread Andi Kleen

From: Muli Ben-Yehuda <[EMAIL PROTECTED]>

- set bad_dma_address explicitly to 0x0
- reserve 32 pages from bad_dma_address and up
- WARN_ON() a driver feeding us bad_dma_address

Thanks to Leo Duran <[EMAIL PROTECTED]> for the suggestion.
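
A hedged sketch of the driver-side mistake the new WARN_ON() catches; the
function, device and buffer below are hypothetical, only the DMA API calls
are real:

	#include <linux/device.h>
	#include <linux/dma-mapping.h>
	#include <linux/errno.h>

	/* Sketch: the pattern the check is aimed at. */
	static int send_buffer(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

		if (dma_mapping_error(handle))
			return -EIO;	/* handle == bad_dma_address */

		/* ... hand 'handle' to the hardware, wait for completion ... */

		dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
		return 0;
	}

A driver that skips the dma_mapping_error() check and later unmaps the failed
cookie now hits the WARN_ON() in __iommu_free() instead of silently poking at
the TCE table.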

Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Leo Duran <[EMAIL PROTECTED]>
Cc: Job Mason <[EMAIL PROTECTED]>
---
 arch/x86_64/kernel/pci-calgary.c |   17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/pci-calgary.c
===
--- linux.orig/arch/x86_64/kernel/pci-calgary.c
+++ linux/arch/x86_64/kernel/pci-calgary.c
@@ -138,6 +138,8 @@ static const unsigned long phb_debug_off
 
 #define PHB_DEBUG_STUFF_OFFSET 0x0020
 
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
@@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_ta
 {
unsigned long entry;
unsigned long badbit;
+   unsigned long badend;
+
+   /* were we called with bad_dma_address? */
+   badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+   if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+   printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
+  "address 0x%Lx\n", dma_addr);
+   WARN_ON(1);
+   return;
+   }
 
entry = dma_addr >> PAGE_SHIFT;
 
@@ -656,8 +668,8 @@ static void __init calgary_reserve_regio
u64 start;
struct iommu_table *tbl = dev->sysdata;
 
-   /* reserve bad_dma_address in case it's a legal address */
-   iommu_range_reserve(tbl, bad_dma_address, 1);
+   /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+   iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
 
/* avoid the BIOS/VGA first 640KB-1MB region */
start = (640 * 1024);
@@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void)
}
 
force_iommu = 1;
+   bad_dma_address = 0x0;
dma_ops = _dma_ops;
 
return 0;


[PATCH x86 for review II] [16/39] x86: simplify notify_page_fault()

2007-02-11 Thread Andi Kleen

From: "Jan Beulich" <[EMAIL PROTECTED]>
Remove all parameters from this function that aren't really variable.

Signed-off-by: Jan Beulich <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/i386/mm/fault.c   |   18 --
 arch/x86_64/mm/fault.c |   18 --
 2 files changed, 16 insertions(+), 20 deletions(-)

Index: linux/arch/i386/mm/fault.c
===
--- linux.orig/arch/i386/mm/fault.c
+++ linux/arch/i386/mm/fault.c
@@ -46,17 +46,17 @@ int unregister_page_fault_notifier(struc
 }
 EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
 
-static inline int notify_page_fault(enum die_val val, const char *str,
-   struct pt_regs *regs, long err, int trap, int sig)
+static inline int notify_page_fault(struct pt_regs *regs, long err)
 {
struct die_args args = {
.regs = regs,
-   .str = str,
+   .str = "page fault",
.err = err,
-   .trapnr = trap,
-   .signr = sig
+   .trapnr = 14,
+   .signr = SIGSEGV
};
-   return atomic_notifier_call_chain(_page_fault_chain, val, );
+   return atomic_notifier_call_chain(_page_fault_chain,
+ DIE_PAGE_FAULT, );
 }
 
 /*
@@ -353,8 +353,7 @@ fastcall void __kprobes do_page_fault(st
if (unlikely(address >= TASK_SIZE)) {
if (!(error_code & 0x000d) && vmalloc_fault(address) >= 0)
return;
-   if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, 
error_code, 14,
-   SIGSEGV) == NOTIFY_STOP)
+   if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
/*
 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -363,8 +362,7 @@ fastcall void __kprobes do_page_fault(st
goto bad_area_nosemaphore;
}
 
-   if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 
14,
-   SIGSEGV) == NOTIFY_STOP)
+   if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
 
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
Index: linux/arch/x86_64/mm/fault.c
===
--- linux.orig/arch/x86_64/mm/fault.c
+++ linux/arch/x86_64/mm/fault.c
@@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struc
 }
 EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
 
-static inline int notify_page_fault(enum die_val val, const char *str,
-   struct pt_regs *regs, long err, int trap, int sig)
+static inline int notify_page_fault(struct pt_regs *regs, long err)
 {
struct die_args args = {
.regs = regs,
-   .str = str,
+   .str = "page fault",
.err = err,
-   .trapnr = trap,
-   .signr = sig
+   .trapnr = 14,
+   .signr = SIGSEGV
};
-   return atomic_notifier_call_chain(_page_fault_chain, val, );
+   return atomic_notifier_call_chain(_page_fault_chain,
+ DIE_PAGE_FAULT, );
 }
 
 void bust_spinlocks(int yes)
@@ -376,8 +376,7 @@ asmlinkage void __kprobes do_page_fault(
if (vmalloc_fault(address) >= 0)
return;
}
-   if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, 
error_code, 14,
-   SIGSEGV) == NOTIFY_STOP)
+   if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
/*
 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -386,8 +385,7 @@ asmlinkage void __kprobes do_page_fault(
goto bad_area_nosemaphore;
}
 
-   if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 
14,
-   SIGSEGV) == NOTIFY_STOP)
+   if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
 
if (likely(regs->eflags & X86_EFLAGS_IF))


[PATCH x86 for review II] [38/39] x86: fix laptop bootup hang in init_acpi()

2007-02-11 Thread Andi Kleen

From: Ingo Molnar <[EMAIL PROTECTED]>

During kernel bootup, a new T60 laptop (CoreDuo, 32-bit) hangs about
10%-20% of the time in acpi_init():

 Calling initcall 0xc055ce1a: topology_init+0x0/0x2f()
 Calling initcall 0xc055d75e: mtrr_init_finialize+0x0/0x2c()
 Calling initcall 0xc05664f3: param_sysfs_init+0x0/0x175()
 Calling initcall 0xc014cb65: pm_sysrq_init+0x0/0x17()
 Calling initcall 0xc0569f99: init_bio+0x0/0xf4()
 Calling initcall 0xc056b865: genhd_device_init+0x0/0x50()
 Calling initcall 0xc056c4bd: fbmem_init+0x0/0x87()
 Calling initcall 0xc056dd74: acpi_init+0x0/0x1ee()

It's a hard hang that not even an NMI could punch through!  Frustratingly,
adding printks or function tracing to the ACPI code made the hangs go away
...

After some time an additional detail emerged: disabling the NMI watchdog
made these occasional hangs go away.

So i spent the better part of today trying to debug this and trying out
various theories, when i finally found the likely reason for the hang: if
acpi_ns_initialize_devices() executes an _INI AML method and an NMI
happens to hit that AML execution at the wrong moment, the machine
hangs.  (my theory is that this must be some sort of chipset setup method
doing stores to chipset mmio registers?)

Unfortunately, given the characteristics of the hang, it was all but
impossible to figure out which of the numerous AML methods is impacted
by this problem.

As a workaround i wrote an interface to disable chipset-based NMIs while
executing _INI sections - and indeed this fixed the hang.  I did a
boot-loop of 100 separate reboots and none hung - while without the patch
it would hang every 5-10 attempts.  Out of caution i did not touch the
nmi_watchdog=2 case (it's not related to the chipset anyway and didn't
hang).

I implemented this for both x86_64 and i686, tested the i686 laptop both
with nmi_watchdog=1 [which triggered the hangs] and nmi_watchdog=2, and
tested an Athlon64 box with the 64-bit kernel as well. Everything builds
and works with the patch applied.
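
The drivers/acpi change simply brackets the _INI evaluation with the two new
hooks. A hedged sketch of the call pattern (ACPICA details omitted; the exact
placement inside acpi_ns_init_one_device() may differ):

	acpi_nmi_disable();	/* mask IO-APIC timer NMIs on all CPUs */
	status = acpi_ns_evaluate(info);
	acpi_nmi_enable();	/* re-arm the NMI watchdog ticks */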

Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Cc: Len Brown <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/i386/kernel/nmi.c  |   28 
 arch/x86_64/kernel/nmi.c|   27 +++
 drivers/acpi/namespace/nsinit.c |9 +
 include/linux/nmi.h |9 -
 4 files changed, 72 insertions(+), 1 deletion(-)

Index: linux/arch/i386/kernel/nmi.c
===
--- linux.orig/arch/i386/kernel/nmi.c
+++ linux/arch/i386/kernel/nmi.c
@@ -383,6 +383,34 @@ void enable_timer_nmi_watchdog(void)
}
 }
 
+static void __acpi_nmi_disable(void *__unused)
+{
+   apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+   if (atomic_read(_active) && nmi_watchdog == NMI_IO_APIC)
+   on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+   apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+   if (atomic_read(_active) && nmi_watchdog == NMI_IO_APIC)
+   on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
 #ifdef CONFIG_PM
 
 static int nmi_pm_active; /* nmi_active before suspend */
Index: linux/arch/x86_64/kernel/nmi.c
===
--- linux.orig/arch/x86_64/kernel/nmi.c
+++ linux/arch/x86_64/kernel/nmi.c
@@ -368,6 +368,33 @@ void enable_timer_nmi_watchdog(void)
}
 }
 
+static void __acpi_nmi_disable(void *__unused)
+{
+   apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+   if (atomic_read(_active) && nmi_watchdog == NMI_IO_APIC)
+   on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+   apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+   if (atomic_read(_active) && nmi_watchdog == NMI_IO_APIC)
+   on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
 #ifdef CONFIG_PM
 
 static int nmi_pm_active; /* nmi_active before suspend */
Index: linux/drivers/acpi/namespace/nsinit.c
===
--- linux.orig/drivers/acpi/namespace/nsinit.c
+++ linux/drivers/acpi/namespace/nsinit.c
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define _COMPONENT  ACPI_NAMESPACE
 ACPI_MODULE_NAME("nsinit")
@@ -534,7 +535,15 @@ acpi_ns_init_one_device(acpi_handle obj_
info->parameter_type = ACPI_PARAM_ARGS;
info->flags = ACPI_IGNORE_RETURN_VALUE;
 
+   /*
+  

[PATCH x86 for review II] [28/39] i386: fix size_or_mask and size_and_mask

2007-02-11 Thread Andi Kleen

From: "Andreas Herrmann" <[EMAIL PROTECTED]>
mtrr: fix size_or_mask and size_and_mask

This fixes two bugs in the /proc/mtrr interface:
o If the physical address size crosses the 44-bit boundary,
  size_or_mask is evaluated incorrectly.
o size_and_mask limits the width of the physical base
  address for an MTRR to less than 44 bits.
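
As a concrete illustration of the first bug (the 45-bit width is a made-up
example; PAGE_SHIFT is 12): the old 32-bit expression ends up shifting a u32
by more than 31 bits, while the 64-bit form gives the intended page-frame
mask:

	/* Sketch with assumed numbers; u32/u64 are the kernel's fixed types. */
	unsigned int phys_addr = 45;			/* hypothetical CPU */

	u32 bad  = ~((1    << (phys_addr - 12)) - 1);	/* 1 << 33: overflows */
	u64 good = ~((1ULL << (phys_addr - 12)) - 1);	/* 0xfffffffe00000000 */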

TBD: later patch had one more change, but I think that was bogus.
TBD: need to double check

Signed-off-by: Andreas Herrmann <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/i386/kernel/cpu/mtrr/main.c |6 +++---
 arch/i386/kernel/cpu/mtrr/mtrr.h |2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

Index: linux/arch/i386/kernel/cpu/mtrr/main.c
===
--- linux.orig/arch/i386/kernel/cpu/mtrr/main.c
+++ linux/arch/i386/kernel/cpu/mtrr/main.c
@@ -50,7 +50,7 @@ u32 num_var_ranges = 0;
 unsigned int *usage_table;
 static DEFINE_MUTEX(mtrr_mutex);
 
-u32 size_or_mask, size_and_mask;
+u64 size_or_mask, size_and_mask;
 
 static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
 
@@ -662,8 +662,8 @@ void __init mtrr_bp_init(void)
 boot_cpu_data.x86_mask == 0x4))
phys_addr = 36;
 
-   size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
-   size_and_mask = ~size_or_mask & 0xfff0;
+   size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 
1);
+   size_and_mask = ~size_or_mask & 0xf0ULL;
} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
   boot_cpu_data.x86 == 6) {
/* VIA C* family have Intel style MTRRs, but
Index: linux/arch/i386/kernel/cpu/mtrr/mtrr.h
===
--- linux.orig/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ linux/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -84,7 +84,7 @@ void get_mtrr_state(void);
 
 extern void set_mtrr_ops(struct mtrr_ops * ops);
 
-extern u32 size_or_mask, size_and_mask;
+extern u64 size_or_mask, size_and_mask;
 extern struct mtrr_ops * mtrr_if;
 
 #define is_cpu(vnd)(mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)


[PATCH x86 for review II] [29/39] x86_64: - Ignore long SMI interrupts in clock calibration code - update 1

2007-02-11 Thread Andi Kleen

From: Jack Steiner <[EMAIL PROTECTED]>
Add a failsafe mechanism to the HPET/TSC clock calibration.

Signed-off-by: Jack Steiner <[EMAIL PROTECTED]>

Updated to include failsafe mechanism & additional community feedback.
Patch built on latest 2.6.20-rc4-mm1 tree.




Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/time.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/time.c
===
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc);
 
 #define TICK_COUNT 1
 #define TICK_MIN   5000
+#define MAX_READ_RETRIES 5
 
 /*
  * Some platforms take periodic SMI interrupts with 5ms duration. Make sure 
none
@@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc);
  */
 static void __init read_hpet_tsc(int *hpet, int *tsc)
 {
-   int tsc1, tsc2, hpet1;
+   int tsc1, tsc2, hpet1, retries = 0;
+   static int msg;
 
do {
tsc1 = get_cycles_sync();
hpet1 = hpet_readl(HPET_COUNTER);
tsc2 = get_cycles_sync();
-   } while (tsc2 - tsc1 > TICK_MIN);
+   } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES);
+   if (retries >= MAX_READ_RETRIES && !msg++)
+   printk(KERN_WARNING
+  "hpet.c: exceeded max retries to read HPET & TSC\n");
*hpet = hpet1;
*tsc = tsc2;
 }


[PATCH x86 for review II] [30/39] x86_64: Check return value of putreg in PTRACE_SETREGS

2007-02-11 Thread Andi Kleen

This means that if an illegal value is set for the segment registers,
ptrace will now error out with an errno instead of silently ignoring
it.
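
A hedged sketch of what this changes from a debugger's point of view (x86-64
user_regs_struct assumed; the selector value is deliberately invalid):

	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>
	#include <stdio.h>

	/* Sketch: poke a kernel code segment (RPL 0) into a traced child. */
	static void poke_bad_cs(pid_t child)
	{
		struct user_regs_struct regs;

		ptrace(PTRACE_GETREGS, child, 0, &regs);
		regs.cs = 0x10;			/* not a valid user selector */
		if (ptrace(PTRACE_SETREGS, child, 0, &regs) == -1)
			perror("PTRACE_SETREGS");	/* now reported, e.g. EIO */
		/* before this patch the bad value was silently dropped */
	}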

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 arch/x86_64/kernel/ptrace.c |8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/kernel/ptrace.c
===
--- linux.orig/arch/x86_64/kernel/ptrace.c
+++ linux/arch/x86_64/kernel/ptrace.c
@@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *chi
}
ret = 0;
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += 
sizeof(long)) {
-   ret |= __get_user(tmp, (unsigned long __user *) data);
-   putreg(child, ui, tmp);
+   ret = __get_user(tmp, (unsigned long __user *) data);
+   if (ret)
+   break;
+   ret = putreg(child, ui, tmp);
+   if (ret)
+   break;
data += sizeof(long);
}
break;


[PATCH x86 for review II] [21/39] i386: rdmsr_on_cpu, wrmsr_on_cpu

2007-02-11 Thread Andi Kleen

From: Alexey Dobriyan <[EMAIL PROTECTED]>
There was an OpenVZ-specific bug rendering some cpufreq drivers unusable
on SMP. In short, when the cpufreq code thinks it has confined itself to
the needed CPU by means of set_cpus_allowed() in order to execute rdmsr,
some "virtual cpu" feature can migrate the process anywhere. This triggers
BUG_ONs and does wrong things in general.

This is fixed by introducing rdmsr_on_cpu and wrmsr_on_cpu, which execute
rdmsr and wrmsr on the given physical CPU by means of
smp_call_function_single().

AK: link it into 64bit kernel too because cpufreq drivers use it.
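
A hedged usage sketch from a cpufreq driver's point of view (the bit layout
of the thermal-control MSR is only illustrative here):

	#include <asm/msr.h>

	/* Sketch: read-modify-write an MSR on one specific CPU without any
	 * set_cpus_allowed() games. */
	static void enable_modulation(unsigned int cpu)
	{
		u32 l, h;

		rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
		l |= (1 << 4);			/* clock modulation enable bit */
		wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
	}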

 arch/i386/kernel/cpu/cpufreq/p4-clockmod.c |   30 ++--
 arch/i386/lib/Makefile |2 
 arch/i386/lib/msr-on-cpu.c |   70 +
 arch/x86_64/lib/Makefile   |4 +
 include/asm-i386/msr.h |3 +
 5 files changed, 84 insertions(+), 25 deletions(-)

Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

Index: linux/arch/i386/lib/Makefile
===
--- linux.orig/arch/i386/lib/Makefile
+++ linux/arch/i386/lib/Makefile
@@ -7,3 +7,5 @@ lib-y = checksum.o delay.o usercopy.o ge
bitops.o semaphore.o
 
 lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+
+obj-y = msr-on-cpu.o
Index: linux/arch/i386/lib/msr-on-cpu.c
===
--- /dev/null
+++ linux/arch/i386/lib/msr-on-cpu.c
@@ -0,0 +1,70 @@
+#include 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SMP
+struct msr_info {
+   u32 msr_no;
+   u32 l, h;
+};
+
+static void __rdmsr_on_cpu(void *info)
+{
+   struct msr_info *rv = info;
+
+   rdmsr(rv->msr_no, rv->l, rv->h);
+}
+
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+   preempt_disable();
+   if (smp_processor_id() == cpu)
+   rdmsr(msr_no, *l, *h);
+   else {
+   struct msr_info rv;
+
+   rv.msr_no = msr_no;
+   smp_call_function_single(cpu, __rdmsr_on_cpu, , 0, 1);
+   *l = rv.l;
+   *h = rv.h;
+   }
+   preempt_enable();
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+   struct msr_info *rv = info;
+
+   wrmsr(rv->msr_no, rv->l, rv->h);
+}
+
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+   preempt_disable();
+   if (smp_processor_id() == cpu)
+   wrmsr(msr_no, l, h);
+   else {
+   struct msr_info rv;
+
+   rv.msr_no = msr_no;
+   rv.l = l;
+   rv.h = h;
+   smp_call_function_single(cpu, __wrmsr_on_cpu, , 0, 1);
+   }
+   preempt_enable();
+}
+#else
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+   rdmsr(msr_no, *l, *h);
+}
+
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+   wrmsr(msr_no, l, h);
+}
+#endif
+
+EXPORT_SYMBOL(rdmsr_on_cpu);
+EXPORT_SYMBOL(wrmsr_on_cpu);
Index: linux/include/asm-i386/msr.h
===
--- linux.orig/include/asm-i386/msr.h
+++ linux/include/asm-i386/msr.h
@@ -83,6 +83,9 @@ static inline void wrmsrl (unsigned long
  : "c" (counter))
 #endif /* !CONFIG_PARAVIRT */
 
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+
 /* symbolic names for some interesting MSRs */
 /* Intel defined MSRs. */
 #define MSR_IA32_P5_MC_ADDR0
Index: linux/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
===
--- linux.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
+++ linux/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
@@ -62,7 +62,7 @@ static int cpufreq_p4_setdc(unsigned int
if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == 
DC_RESV))
return -EINVAL;
 
-   rdmsr(MSR_IA32_THERM_STATUS, l, h);
+   rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, , );
 
if (l & 0x01)
dprintk("CPU#%d currently thermal throttled\n", cpu);
@@ -70,10 +70,10 @@ static int cpufreq_p4_setdc(unsigned int
if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == 
DC_DFLT))
newstate = DC_38PT;
 
-   rdmsr(MSR_IA32_THERM_CONTROL, l, h);
+   rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, , );
if (newstate == DC_DISABLE) {
dprintk("CPU#%d disabling modulation\n", cpu);
-   wrmsr(MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
+   wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
} else {
dprintk("CPU#%d setting duty cycle to %d%%\n",
cpu, ((125 * newstate) / 10));
@@ -84,7 +84,7 @@ static int cpufreq_p4_setdc(unsigned int
 */
 

[PATCH x86 for review II] [15/39] x86_64: list x86_64 quilt tree

2007-02-11 Thread Andi Kleen

From: Randy Dunlap <[EMAIL PROTECTED]>

List x86_64 quilt tree in MAINTAINERS.

Signed-off-by: Randy Dunlap <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---
 MAINTAINERS |1 +
 1 file changed, 1 insertion(+)

Index: linux/MAINTAINERS
===
--- linux.orig/MAINTAINERS
+++ linux/MAINTAINERS
@@ -3735,6 +3735,7 @@ P:Andi Kleen
 M: [EMAIL PROTECTED]
 L: [EMAIL PROTECTED]
 W: http://www.x86-64.org
+T: quilt ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt-current
 S: Maintained
 
 YAM DRIVER FOR AX.25


[PATCH x86 for review II] [23/39] i386: use smp_call_function_single()

2007-02-11 Thread Andi Kleen

From: Alexey Dobriyan <[EMAIL PROTECTED]>
It will execute rdmsr and wrmsr only on the cpu we need.
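
For background: smp_call_function() runs the callback on every other CPU,
which is why the old code had to re-check cmd->cpu inside the handlers;
smp_call_function_single() targets exactly one CPU, so both the check and the
cpu field can go. Rough signatures as of this kernel era, shown only for
orientation:

	int smp_call_function(void (*func)(void *info), void *info,
			      int retry, int wait);
	int smp_call_function_single(int cpu, void (*func)(void *info),
				     void *info, int retry, int wait);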

Signed-off-by: Alexey Dobriyan <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/msr.c |   13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

Index: linux/arch/i386/kernel/msr.c
===
--- linux.orig/arch/i386/kernel/msr.c
+++ linux/arch/i386/kernel/msr.c
@@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32
 #ifdef CONFIG_SMP
 
 struct msr_command {
-   int cpu;
int err;
u32 reg;
u32 data[2];
@@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_bloc
 {
struct msr_command *cmd = (struct msr_command *)cmd_block;
 
-   if (cmd->cpu == smp_processor_id())
-   cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
+   cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
 }
 
 static void msr_smp_rdmsr(void *cmd_block)
 {
struct msr_command *cmd = (struct msr_command *)cmd_block;
 
-   if (cmd->cpu == smp_processor_id())
-   cmd->err = rdmsr_eio(cmd->reg, >data[0], >data[1]);
+   cmd->err = rdmsr_eio(cmd->reg, >data[0], >data[1]);
 }
 
 static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
@@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 
if (cpu == smp_processor_id()) {
ret = wrmsr_eio(reg, eax, edx);
} else {
-   cmd.cpu = cpu;
cmd.reg = reg;
cmd.data[0] = eax;
cmd.data[1] = edx;
 
-   smp_call_function(msr_smp_wrmsr, , 1, 1);
+   smp_call_function_single(cpu, msr_smp_wrmsr, , 1, 1);
ret = cmd.err;
}
preempt_enable();
@@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 
if (cpu == smp_processor_id()) {
ret = rdmsr_eio(reg, eax, edx);
} else {
-   cmd.cpu = cpu;
cmd.reg = reg;
 
-   smp_call_function(msr_smp_rdmsr, , 1, 1);
+   smp_call_function_single(cpu, msr_smp_rdmsr, , 1, 1);
 
*eax = cmd.data[0];
*edx = cmd.data[1];


[PATCH x86 for review II] [2/39] x86_64: Break init() in two parts to avoid MODPOST warnings

2007-02-11 Thread Andi Kleen

From: Vivek Goyal <[EMAIL PROTECTED]>

o init() is a non-__init function in the .text section, but it calls many
  functions which are in the .init.text section. Hence MODPOST generates lots
  of cross-reference warnings on i386 if compiled with CONFIG_RELOCATABLE=y:

WARNING: vmlinux - Section mismatch: reference to .init.text:smp_prepare_cpus 
from .text between 'init' (at offset 0xc0101049) and 'rest_init'
WARNING: vmlinux - Section mismatch: reference to .init.text:migration_init 
from .text between 'init' (at offset 0xc010104e) and 'rest_init'
WARNING: vmlinux - Section mismatch: reference to .init.text:spawn_ksoftirqd 
from .text between 'init' (at offset 0xc0101053) and 'rest_init'

o This patch breaks init() down into two parts: one part which can go
  in the .init.text section and be freed, and another part which has to
  be non-__init (init_post()). Now init() calls init_post(), and init_post()
  does not call any functions present in .init sections, which gets rid
  of the warnings.
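
The noinline on init_post() is what keeps the split effective: without it gcc
may inline the body straight back into init() and reintroduce the references.
A minimal editorial sketch of the same pattern, with made-up function names:

	#include <linux/init.h>

	static void __init early_setup(void)	/* .init.text, freed later */
	{
		/* one-time boot work */
	}

	static int noinline late_body(void)	/* stays in .text */
	{
		/* must not call anything __init */
		return 0;
	}

	static int bootstrap(void *unused)	/* non-__init caller */
	{
		early_setup();		/* runs before initmem is freed */
		return late_body();	/* everything after free_initmem() */
	}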

Signed-off-by: Vivek Goyal <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 init/main.c |   81 +---
 1 file changed, 45 insertions(+), 36 deletions(-)

Index: linux/init/main.c
===
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -713,7 +713,49 @@ static void run_init_process(char *init_
kernel_execve(init_filename, argv_init, envp_init);
 }
 
-static int init(void * unused)
+/* This is a non __init function. Force it to be noinline otherwise gcc
+ * makes it inline to init() and it becomes part of init.text section
+ */
+static int noinline init_post(void)
+{
+   free_initmem();
+   unlock_kernel();
+   mark_rodata_ro();
+   system_state = SYSTEM_RUNNING;
+   numa_default_policy();
+
+   if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
+   printk(KERN_WARNING "Warning: unable to open an initial 
console.\n");
+
+   (void) sys_dup(0);
+   (void) sys_dup(0);
+
+   if (ramdisk_execute_command) {
+   run_init_process(ramdisk_execute_command);
+   printk(KERN_WARNING "Failed to execute %s\n",
+   ramdisk_execute_command);
+   }
+
+   /*
+* We try each of these until one succeeds.
+*
+* The Bourne shell can be used instead of init if we are
+* trying to recover a really broken machine.
+*/
+   if (execute_command) {
+   run_init_process(execute_command);
+   printk(KERN_WARNING "Failed to execute %s.  Attempting "
+   "defaults...\n", execute_command);
+   }
+   run_init_process("/sbin/init");
+   run_init_process("/etc/init");
+   run_init_process("/bin/init");
+   run_init_process("/bin/sh");
+
+   panic("No init found.  Try passing init= option to kernel.");
+}
+
+static int __init init(void * unused)
 {
lock_kernel();
/*
@@ -761,39 +803,6 @@ static int init(void * unused)
 * we're essentially up and running. Get rid of the
 * initmem segments and start the user-mode stuff..
 */
-   free_initmem();
-   unlock_kernel();
-   mark_rodata_ro();
-   system_state = SYSTEM_RUNNING;
-   numa_default_policy();
-
-   if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
-   printk(KERN_WARNING "Warning: unable to open an initial 
console.\n");
-
-   (void) sys_dup(0);
-   (void) sys_dup(0);
-
-   if (ramdisk_execute_command) {
-   run_init_process(ramdisk_execute_command);
-   printk(KERN_WARNING "Failed to execute %s\n",
-   ramdisk_execute_command);
-   }
-
-   /*
-* We try each of these until one succeeds.
-*
-* The Bourne shell can be used instead of init if we are 
-* trying to recover a really broken machine.
-*/
-   if (execute_command) {
-   run_init_process(execute_command);
-   printk(KERN_WARNING "Failed to execute %s.  Attempting "
-   "defaults...\n", execute_command);
-   }
-   run_init_process("/sbin/init");
-   run_init_process("/etc/init");
-   run_init_process("/bin/init");
-   run_init_process("/bin/sh");
-
-   panic("No init found.  Try passing init= option to kernel.");
+   init_post();
+   return 0;
 }


[PATCH x86 for review II] [1/39] i386: move startup_32() in text.head section

2007-02-11 Thread Andi Kleen

From: Vivek Goyal <[EMAIL PROTECTED]>

o The entry point startup_32 was in the .text section, but it was also
  accessing some init data, which prompts MODPOST to generate compilation
  warnings.

WARNING: vmlinux - Section mismatch: reference to .init.data:boot_params from
.text between '_text' (at offset 0xc0100029) and 'startup_32_smp'
WARNING: vmlinux - Section mismatch: reference to .init.data:boot_params from
.text between '_text' (at offset 0xc0100037) and 'startup_32_smp'
WARNING: vmlinux - Section mismatch: reference to
.init.data:init_pg_tables_end from .text between '_text' (at offset
0xc0100099) and 'startup_32_smp'

o Can't move startup_32 to .init.text, as this entry point has to be at the
  start of bzImage. Hence startup_32 is moved to a new section .text.head, and
  MODPOST is instructed not to generate warnings if init data is being
  accessed from the .text.head section. This code has been audited.

o SMP boot up code (startup_32_smp) can go into .init.text if CPU hotplug
  is not supported. Otherwise it generates more warnings

WARNING: vmlinux - Section mismatch: reference to .init.data:new_cpu_data from
.text between 'checkCPUtype' (at offset 0xc0100126) and 'is486'
WARNING: vmlinux - Section mismatch: reference to .init.data:new_cpu_data from
.text between 'checkCPUtype' (at offset 0xc0100130) and 'is486'

Signed-off-by: Vivek Goyal <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>

---

 arch/i386/kernel/head.S|   17 ++---
 arch/i386/kernel/vmlinux.lds.S |7 ++-
 scripts/mod/modpost.c  |   10 +-
 3 files changed, 29 insertions(+), 5 deletions(-)

Index: linux/arch/i386/kernel/head.S
===
--- linux.orig/arch/i386/kernel/head.S
+++ linux/arch/i386/kernel/head.S
@@ -53,6 +53,7 @@
  * any particular GDT layout, because we load our own as soon as we
  * can.
  */
+.section .text.head,"ax",@progbits
 ENTRY(startup_32)
 
 #ifdef CONFIG_PARAVIRT
@@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
jb 10b
movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
 
-#ifdef CONFIG_SMP
xorl %ebx,%ebx  /* This is the boot CPU (BSP) */
jmp 3f
-
 /*
  * Non-boot CPU entry point; entered from trampoline.S
  * We can't lgdt here, because lgdt itself uses a data segment, but
  * we know the trampoline has already loaded the boot_gdt_table GDT
  * for us.
+ *
+ * If cpu hotplug is not supported then this code can go in init section
+ * which will be freed later
  */
+
+#ifdef CONFIG_HOTPLUG_CPU
+.section .text,"ax",@progbits
+#else
+.section .init.text,"ax",@progbits
+#endif
+
+#ifdef CONFIG_SMP
 ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
@@ -208,8 +218,8 @@ ENTRY(startup_32_smp)
xorl %ebx,%ebx
incl %ebx
 
-3:
 #endif /* CONFIG_SMP */
+3:
 
 /*
  * Enable paging
@@ -492,6 +502,7 @@ ignore_int:
 #endif
iret
 
+.section .text
 #ifdef CONFIG_PARAVIRT
 startup_paravirt:
cld
Index: linux/arch/i386/kernel/vmlinux.lds.S
===
--- linux.orig/arch/i386/kernel/vmlinux.lds.S
+++ linux/arch/i386/kernel/vmlinux.lds.S
@@ -37,9 +37,14 @@ SECTIONS
 {
   . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
   phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+   _text = .;  /* Text and read-only data */
+   *(.text.head)
+  } :text = 0x9090
+
   /* read-only */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
-   _text = .;  /* Text and read-only data */
*(.text)
SCHED_TEXT
LOCK_TEXT
Index: linux/scripts/mod/modpost.c
===
--- linux.orig/scripts/mod/modpost.c
+++ linux/scripts/mod/modpost.c
@@ -641,12 +641,20 @@ static int secref_whitelist(const char *
if (f1 && f2)
return 1;
 
-   /* Whitelist all references from .pci_fixup section if vmlinux */
+   /* Whitelist all references from .pci_fixup section if vmlinux
+* Whitelist all references from .text.head to .init.data if vmlinux
+* Whitelist all references from .text.head to .init.text if vmlinux
+*/
if (is_vmlinux(modname)) {
if ((strcmp(fromsec, ".pci_fixup") == 0) &&
(strcmp(tosec, ".init.text") == 0))
return 1;
 
+   if ((strcmp(fromsec, ".text.head") == 0) &&
+   ((strcmp(tosec, ".init.data") == 0) ||
+   (strcmp(tosec, ".init.text") == 0)))
+   return 1;
+
/* Check for pattern 3 */
for (s = pat3refsym; *s; s++)
if (strcmp(refsymname, *s) == 0)

Re: [IA64] swiotlb abstraction (e.g. for Xen)

2007-02-11 Thread Jan Beulich
On Wed, Feb 07, 2007 at 09:32:54AM +0100, Christoph Hellwig wrote:
> On Wed, Feb 07, 2007 at 07:59:18AM +, Linux Kernel Mailing List wrote:
> > [IA64] swiotlb abstraction (e.g. for Xen)
> > 
> > Add abstraction so that the file can be used by environments other than 
> > IA64
> > and EM64T, namely for Xen.
> 
> Tony, this code is more than ugly, and even further not needed for anything
> we actually need.  Can you please revert it.
> 
> Some comments below in case we need justification..
> 
> If Jan actually had a goal with that except making the code utterly
> unreadable he should try again with small patches that are well
> explained and do one thing at a at time.  (And cane be reviewed an
> improved on if needed.

As the topic says - the goal is to support Xen. But yes, I was afraid someone
would claim this makes the code look ugly. And no, I currently don't have any
ideas for addressing your comments without breaking functionality on Xen...

Jan


Re: CPU load

2007-02-11 Thread Con Kolivas
On Monday 12 February 2007 18:10, malc wrote:
> On Mon, 12 Feb 2007, Con Kolivas wrote:
> > Lots of confusion comes from this, and often people think their pc
> > suddenly uses a lot less cpu when they change from 1000HZ to 100HZ and
> > use this as an argument/reason for changing to 100HZ when in fact the
> > massive _reported_ difference is simply worse accounting. Of course there
> > is more overhead going from 100 to 1000 but it doesn't suddenly make your
> > apps use 10 times more cpu.
>
> Yep. This, i belive, what made the mplayer developers incorrectly conclude
> that utilizing RTC suddenly made the code run slower, after all /proc/stat
> now claims that CPU load is higher, while in reality it stayed the same -
> it's the accuracy that has improved (somewhat)
>
> But back to the original question, does it look at what's running on timer
> interrupt only or any IRQ? (something which is more in line with my own
> observations)

During the timer interrupt only. However if you create any form of timer, they 
will of course have some periodicity relationship with the timer interrupt.
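
A hedged user-space sketch of the sampling artifact being discussed (HZ value
and loop counts are assumptions; the effect only appears when the busy phase
happens to fall between ticks):

	#include <time.h>

	/* Sketch: burn CPU between ticks, be asleep when the tick samples. */
	int main(void)
	{
		struct timespec nap = { 0, 2 * 1000 * 1000 };	/* 2 ms */

		for (;;) {
			volatile long i;

			for (i = 0; i < 4 * 1000 * 1000; i++)
				;		/* a few ms of spinning */
			nanosleep(&nap, NULL);	/* ideally spans the tick */
		}
	}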

-- 
-ck


Re: remote debugging via FireWire

2007-02-11 Thread Benjamin Herrenschmidt
On Mon, 2007-02-12 at 07:49 +0100, Andi Kleen wrote:
> On Sunday 11 February 2007 22:35, Benjamin Herrenschmidt wrote:
> 
> > I'd like to have that on ppc as well, so I'd rather keep it in drivers/
> 
> This will need some abstraction at least -- there are some early mapping hacks
> that are x86 specific right now.

Either abstraction or ifdef's .. we have ioremap working very early on
ppc :-)

> > I agree that it doesn't need to be a module. If you can load modules,
> > then you can load the full ohci driver. Thus, if it's an early thingy
> > initialized by arch, it can export a special "takeover" hook that the
> > proper ohci module can then call to override it (important if we start
> > having an irq handler).
> > 
> > Andi, also, how do you deal with iommu ? Not at all ? :-)
> 
> Yes -- it's really early debugging hack mostly. It's reasonable to 
> let the iommu be disabled (or later a special bypass can be added for this) 

Ok.

Ben.




[QUESTION] file access time in millisecond?

2007-02-11 Thread Jeff Chua

Is it possible to get file access time in millisecond resolution?

stat() returns time in seconds, but gettimeofday() can return microseconds.


Thanks,
Jeff.


Re: [PATCH 7/8] lguest: trivial guest block driver

2007-02-11 Thread Rusty Russell
On Mon, 2007-02-12 at 06:32 +0100, Jens Axboe wrote:
> On Mon, Feb 12 2007, Rusty Russell wrote:
> > On Mon, 2007-02-12 at 05:43 +0100, Jens Axboe wrote:
> > > Here you map the entire request (lets call that segment A..Z), but
> > > end_request() only completes the first chunk of the request. So
> > > elv_next_request() will retrieve the same request again, and you'll then
> > > map B..Z and repeat that transfer. So unless I'm missing some other part
> > > here (just read it over quickly), you are re-doing large parts of a
> > > merged request several times.

virtbench before:
Time to read from disk (256 kB): 18654562 nsec
After:
Time to read from disk (256 kB): 8018468 nsec

Thanks Jens!!
Rusty.
PS.  One day I'll buy you a beer and you can explain your nomenclature
theory for the block subsystem 8)

Name: lguest: trivial guest block driver

A simple block driver for lguest (/dev/lgbX).  Only does one request
at once.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

diff -r a155959c419f drivers/block/Makefile
--- a/drivers/block/MakefileMon Feb 12 14:26:47 2007 +1100
+++ b/drivers/block/MakefileMon Feb 12 14:26:47 2007 +1100
@@ -28,4 +28,5 @@ obj-$(CONFIG_VIODASD) += viodasd.o
 obj-$(CONFIG_VIODASD)  += viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)  += sx8.o
 obj-$(CONFIG_BLK_DEV_UB)   += ub.o
+obj-$(CONFIG_LGUEST_GUEST) += lguest_blk.o
 
diff -r a155959c419f drivers/block/lguest_blk.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/drivers/block/lguest_blk.cMon Feb 12 18:07:05 2007 +1100
@@ -0,0 +1,270 @@
+/* A simple block driver for lguest.
+ *
+ * Copyright 2006 Rusty Russell <[EMAIL PROTECTED]> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+//#define DEBUG
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static char next_block_index = 'a';
+
+struct blockdev
+{
+   spinlock_t lock;
+
+   /* The disk structure for the kernel. */
+   struct gendisk *disk;
+
+   /* The major number for this disk. */
+   int major;
+   int irq;
+
+   unsigned long phys_addr;
+   /* The ioremap'ed block page. */
+   struct lguest_block_page *lb_page;
+
+   /* We only have a single request outstanding at a time. */
+   struct lguest_dma dma;
+   struct request *req;
+};
+
+/* Jens gave me this nice helper to end all chunks of a request. */
+static void end_entire_request(struct request *req, int uptodate)
+{
+   if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
+   BUG();
+   add_disk_randomness(req->rq_disk);
+   blkdev_dequeue_request(req);
+   end_that_request_last(req, uptodate);
+}
+
+static irqreturn_t lgb_irq(int irq, void *_bd)
+{
+   struct blockdev *bd = _bd;
+   unsigned long flags;
+
+   if (!bd->req) {
+   pr_debug("No work!\n");
+   return IRQ_NONE;
+   }
+
+   if (!bd->lb_page->result) {
+   pr_debug("No result!\n");
+   return IRQ_NONE;
+   }
+
+   spin_lock_irqsave(>lock, flags);
+   end_entire_request(bd->req, bd->lb_page->result == 1);
+   bd->req = NULL;
+   bd->dma.used_len = 0;
+   blk_start_queue(bd->disk->queue);
+   spin_unlock_irqrestore(>lock, flags);
+   return IRQ_HANDLED;
+}
+
+static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
+{
+   unsigned int i = 0, idx, len = 0;
+   struct bio *bio;
+
+   rq_for_each_bio(bio, req) {
+   struct bio_vec *bvec;
+   bio_for_each_segment(bvec, bio, idx) {
+   BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
+   BUG_ON(!bvec->bv_len);
+   dma->addr[i] = page_to_phys(bvec->bv_page)
+   + bvec->bv_offset;
+   dma->len[i] = bvec->bv_len;
+   len += bvec->bv_len;
+   i++;
+   }
+   }
+   if (i < LGUEST_MAX_DMA_SECTIONS)
+   dma->len[i] = 0;
+   return len;
+}
+
+static void empty_dma(struct lguest_dma *dma)
+{
+   dma->len[0] = 0;
+}
+
+static void setup_req(struct blockdev *bd,
+ int type, struct request *req, struct lguest_dma *dma)
+{
+   bd->lb_page->type = 

Re: [PATCH 1/3] Blackfin: architecture patch against Linux kernel2.6.20 (again)

2007-02-11 Thread Sonic Zhang

Any comments on this Blackfin arch kernel patch for 2.6.20?
We fixed a lot of issues based on the feedback on our last
patch for 2.6.18. We really appreciate your comments on this new one.

Thanks


Re: CPU load

2007-02-11 Thread malc

On Mon, 12 Feb 2007, Con Kolivas wrote:


On Monday 12 February 2007 16:54, malc wrote:

On Mon, 12 Feb 2007, Con Kolivas wrote:

On 12/02/07, Vassili Karpov <[EMAIL PROTECTED]> wrote:


[..snip..]


The kernel looks at what is using cpu _only_ during the timer
interrupt. Which means if your HZ is 1000 it looks at what is running
at precisely the moment those 1000 timer ticks occur. It is
theoretically possible using this measurement system to use >99% cpu
and record 0 usage if you time your cpu usage properly. It gets even
more inaccurate at lower HZ values for the same reason.


Thank you very much. This somewhat contradicts what i saw (and outlined
in usnet article), namely the mplayer+/dev/rtc case. Unless ofcourse
/dev/rtc interrupt is considered to be the same as the interrupt from
PIT (on X86 that is)

P.S. Perhaps it worth documenting this in the documentation? I caused
  me, and perhaps quite a few other people, a great deal of pain and
  frustration.


Lots of confusion comes from this, and often people think their pc suddenly
uses a lot less cpu when they change from 1000HZ to 100HZ and use this as an
argument/reason for changing to 100HZ when in fact the massive _reported_
difference is simply worse accounting. Of course there is more overhead going
from 100 to 1000 but it doesn't suddenly make your apps use 10 times more
cpu.


Yep. This, I believe, is what made the mplayer developers incorrectly conclude
that utilizing RTC suddenly made the code run slower; after all, /proc/stat
now claims that CPU load is higher, while in reality it stayed the same -
it's the accuracy that has improved (somewhat).

But back to the original question: does it look at what's running on the timer
interrupt only, or on any IRQ? (something which is more in line with my own
observations)

--
vale


Re: Documenting MS_RELATIME

2007-02-11 Thread Valerie Henson
On Sat, Feb 10, 2007 at 07:54:00PM -0500, Dave Jones wrote:
> 
> Whilst on the subject of RELATIME, is there any good reason why
> not to make this a default mount option ?

Ubuntu has been shipping with noatime as the default for some time
now, with no obvious problems (I'm running Ubuntu).  I see relatime as
an improvement on noatime.

-VAL


Re: CPU load

2007-02-11 Thread malc

On Mon, 12 Feb 2007, Con Kolivas wrote:


On 12/02/07, Vassili Karpov <[EMAIL PROTECTED]> wrote:


[..snip..]


The kernel looks at what is using cpu _only_ during the timer
interrupt. Which means if your HZ is 1000 it looks at what is running
at precisely the moment those 1000 timer ticks occur. It is
theoretically possible using this measurement system to use >99% cpu
and record 0 usage if you time your cpu usage properly. It gets even
more inaccurate at lower HZ values for the same reason.


Thank you very much. This somewhat contradicts what I saw (and outlined
in the Usenet article), namely the mplayer+/dev/rtc case. Unless of course
the /dev/rtc interrupt is considered to be the same as the interrupt from
the PIT (on x86 that is).

P.S. Perhaps it is worth documenting this in the documentation? It caused
 me, and perhaps quite a few other people, a great deal of pain and
 frustration.

--
vale
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Documenting MS_RELATIME

2007-02-11 Thread Valerie Henson
On Sat, Feb 10, 2007 at 09:56:07AM -0800, Michael Kerrisk wrote:
> Val,
> 
> I'm just updating the mount(2) man page for MS_RELATIME, and this is the
> text I've come up with:
> 
>MS_RELATIME(Since Linux 2.6.20)
>   When a file on this file system is accessed, only
>   update  the  file's last accessed time (atime) if
>   the current value of atime is less than or  equal
>   to  the file's last modified (mtime) or last sta-
>   tus change time (ctime).  This option  is  useful
>   for  programs, such as mutt(1), that need to know
>   when a file has been read since it was last modi-
>   fied.
> 
> This text is based on your comments accompanying the various patches, but
> it differs in a respect.  Your comments said that the atime would only be
> updated if the atime is older than mtime/ctime.  However, what the code
> actually does is update atime if it is <= mtime/ctime -- i.e., atime is
> older than *or equal to* mtime/ctime.
> 
> I'm sure that the code implements your intention, but before incorporating
> the above text I thought I just better check, since the code differs from
> your comment.  Can you just confirm that the proposed man page text is okay.

That's correct, yes.  Thanks!
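
For illustration, the rule described above boils down to roughly the
following check (sketch only, based on the man page text above; the
helper name is made up, this is not the kernel's actual code):

	/* Illustrative sketch of the MS_RELATIME rule: update atime only
	 * if it is older than, or equal to, mtime or ctime. */
	static int relatime_need_update(struct inode *inode)
	{
		if (timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0)
			return 1;
		if (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)
			return 1;
		return 0;
	}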

-VAL
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: remote debugging via FireWire

2007-02-11 Thread Andi Kleen
On Sunday 11 February 2007 22:35, Benjamin Herrenschmidt wrote:

> I'd like to have that on ppc as well, so I'd rather keep it in drivers/

This will need some abstraction at least -- there are some early mapping hacks
that are x86 specific right now.

> I agree that it doesn't need to be a module. If you can load modules,
> then you can load the full ohci driver. Thus, if it's an early thingy
> initialized by arch, it can export a special "takeover" hook that the
> proper ohci module can then call to override it (important if we start
> having an irq handler).
> 
> Andi, also, how do you deal with iommu ? Not at all ? :-)

Yes -- it's really an early debugging hack, mostly. It's reasonable to 
let the iommu be disabled (or later a special bypass can be added for this).

-Andi
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: High CPU usage with sata_nv

2007-02-11 Thread Tejun Heo

ris wrote:

procs ---memory-- ---swap-- -io -system-- cpu
 r  b   swpd   free   buff  cache   si   sobibo   in   cs us sy id wa
 0  0  0 303444  53224 36013200   276   157  627  814  5  2 89  4
 0  0  0 302956  53228 36033200   196 0 1193 1686  2  2 95  1
 0  0  0 303204  53228 36033200 0 0 1175 1544  2  1 97  0
 1  0  0 234656  53240 42888000 34428 0 1498 2532  4 11 69 16
 2  0  0 105776  53248 55637200 63752 0 1729 2696  6 18 47 29
 0  3  0   9464  53248 64870800 43780 28008 1804 2262  6 20 29 44
 0  3   2588   9548  51548 64748800 19200 47616 1503 1903  4  7 42 46
 0  3   2640  10152  50840 64848000 24716 11876 1695 2251  5  8  7 80
 1  2   2640   9788  46468 65205200 54280  5620 1740 2687  6 18  0 76
 0  3   2640   9788  46308 65250000 43392  2212 1626 2038  5 14  0 81
 1  2   2640   9636  46308 65300000 38528  1164 1588 2181  4 12  0 84
 0  3   2640  13872  46084 64826400 30088 12972 1701 2223  4 12  0 85
 0  4   2644   8460  37140 6615720 2640 25648 26372 1615 1908  3 10  0 88
 0  3   2644   9188  16760 68319600 33840 48988 1639 2154  4 12  0 84


iowait != cpu busy.  Your cpu idleness stays above 80%.

--
tejun
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] Re: [autofs] Bad race condition in the new autofs protocol somewhere

2007-02-11 Thread Ian Kent
On Mon, 2007-02-12 at 15:43 +0900, Ian Kent wrote:
> On Thu, 2007-02-08 at 11:33 +0900, Ian Kent wrote:
> > On Wed, 2007-02-07 at 19:18 +0100, Olivier Galibert wrote:
> > > On Thu, Feb 08, 2007 at 03:07:41AM +0900, Ian Kent wrote:
> > > > It may be better to update to a later kernel so I don't have to port the
> > > > patch to several different kernels. Is that possible?
> > > 
> > > Sure, 2.6.20 or -git?
> > 
> > 2.6.20 has all the patches I've proposed so far except for the one we're
> > working on so that would be best for me.
> > 
> > Seems there may still be a problem with the patch so I'll let you know
> > what's happening as soon as I can.
> 
> I think I'm just about done.
> 
> Could you try using the two patches here against 2.6.20 please:
> 
> Ian

---

--- linux-2.6.20/fs/autofs4/root.c.lookup-check-unhased 2007-02-12 
13:49:46.0 +0900
+++ linux-2.6.20/fs/autofs4/root.c  2007-02-12 13:54:58.0 +0900
@@ -655,14 +655,29 @@ static struct dentry *autofs4_lookup(str
 
/*
 * If this dentry is unhashed, then we shouldn't honour this
-* lookup even if the dentry is positive.  Returning ENOENT here
-* doesn't do the right thing for all system calls, but it should
-* be OK for the operations we permit from an autofs.
+* lookup.  Returning ENOENT here doesn't do the right thing
+* for all system calls, but it should be OK for the operations
+* we permit from an autofs.
 */
if (dentry->d_inode && d_unhashed(dentry)) {
+   /*
+* A user space application can (and has done in the past)
+* remove and re-create this directory during the callback.
+* This can leave us with an unhashed dentry, but a
+* successful mount!  So we need to perform another
+* cached lookup in case the dentry now exists.
+*/
+   struct dentry *parent = dentry->d_parent;
+   struct dentry *new = d_lookup(parent, &dentry->d_name);
+   if (new != NULL)
+   dentry = new;
+   else
+   dentry = ERR_PTR(-ENOENT);
+
if (unhashed)
dput(unhashed);
-   return ERR_PTR(-ENOENT);
+
+   return dentry;
}
 
if (unhashed)


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.20-rc6 libata PATA ATAPI CDROM is not working

2007-02-11 Thread Tejun Heo

Please test the attached patch over 2.6.20.

Thanks.

--
tejun
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 667acd2..d6fcf0a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1478,7 +1478,16 @@ int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class,
 	}
 
 	tf.protocol = ATA_PROT_PIO;
-	tf.flags |= ATA_TFLAG_POLLING; /* for polling presence detection */
+
+	/* Some devices choke if TF registers contain garbage.  Make
+	 * sure those are properly initialized.
+	 */
+	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
+
+	/* Device presence detection is unreliable on some
+	 * controllers.  Always poll IDENTIFY if available.
+	 */
+	tf.flags |= ATA_TFLAG_POLLING;
 
 	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE,
  id, sizeof(id[0]) * ATA_ID_WORDS);


Re: AHCI - remove probing of ata2

2007-02-11 Thread Tejun Heo

Hello, Paul.

Paul Rolland wrote:
This looks like the problems that hopefully the patches 

from Tejun and

from Mark Lord cured (the delay after reset and the task file clear)

Any chance I can find this patch(es) and try them ?
 

Also, I've seen a :

ata1: Spurious SDB FIS during NCQ issue=0x0 SAct=0x7ff8001f
FIS=004040a1:0004

What's this ? Is it really bad or just a warning ???

It's one Tejun stuck logging in for in order to investigate further.

OK, will wait a little bit ;)


You have a Maxtor connected to that port, right?  That's caused by a 
firmware bug.  Future kernels will consider that condition a protocol 
violation and blacklist the drive such that NCQ is not used on it.  I 
think it can be dangerous in rare corner/error cases, but there hasn't 
been any actual report of problems caused by it, so no need to worry 
about it too much.  If it bothers you, you can turn NCQ off using the 
sysfs node.  Take a look at the FAQ section of http://linux-ata.org



In the meantime, I've been trying 2.6.20-rc7.
Things are not better, except that the ata2 probing failing results in
no /dev/sdd being allocated, thus it changes all the naming afterwards.


Slightly OT, but it's generally a good idea to use LABELs to access 
filesystems.  With so many ports and EH dynamics including hotplug, it's 
much more difficult to keep device names persistent and remember them.



So, 2.6.20-rc7 results are :
 - ata2 probing still very long and delaying boot a lot,


That's sil4726's config device acting weird.  When reset as a normal 
device (not PMP device), sil4726 acts as a normal ATA device which is 
used to configure how the chip behaves.  Unfortunately, the emulation 
seems to contain some flaws and causes lots of delays, as you've seen. 
I'm attaching a patch to clear TF before IDENTIFY which fixes similar 
IDENTIFY failures on buggy ATAPI devices.



 - Jmicron PATA still not working, and doesn't even detect the DVD-ROM on
   the port.


This problem is likely to be fixed by clearing TF before IDENTIFY.

So, please test the attached patch and report the result.

--
tejun
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 667acd2..d6fcf0a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1478,7 +1478,16 @@ int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class,
 	}
 
 	tf.protocol = ATA_PROT_PIO;
-	tf.flags |= ATA_TFLAG_POLLING; /* for polling presence detection */
+
+	/* Some devices choke if TF registers contain garbage.  Make
+	 * sure those are properly initialized.
+	 */
+	tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
+
+	/* Device presence detection is unreliable on some
+	 * controllers.  Always poll IDENTIFY if available.
+	 */
+	tf.flags |= ATA_TFLAG_POLLING;
 
 	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE,
  id, sizeof(id[0]) * ATA_ID_WORDS);


[PATCH 1/2] Re: [autofs] Bad race condition in the new autofs protocol somewhere

2007-02-11 Thread Ian Kent
On Thu, 2007-02-08 at 11:33 +0900, Ian Kent wrote:
> On Wed, 2007-02-07 at 19:18 +0100, Olivier Galibert wrote:
> > On Thu, Feb 08, 2007 at 03:07:41AM +0900, Ian Kent wrote:
> > > It may be better to update to a later kernel so I don't have to port the
> > > patch to several different kernels. Is that possible?
> > 
> > Sure, 2.6.20 or -git?
> 
> 2.6.20 has all the patches I've proposed so far except for the one we're
> working on so that would be best for me.
> 
> Seems there may still be a problem with the patch so I'll let you know
> what's happening as soon as I can.

I think I'm just about done.

Could you try using the two patches here against 2.6.20 please:

Ian

---

--- linux-2.6.20/fs/autofs4/autofs_i.h.lookup-expire-race   2007-02-05 
03:44:54.0 +0900
+++ linux-2.6.20/fs/autofs4/autofs_i.h  2007-02-12 12:15:17.0 +0900
@@ -52,6 +52,8 @@ struct autofs_info {
 
int flags;
 
+   struct list_head rehash;
+
struct autofs_sb_info *sbi;
unsigned long last_used;
atomic_t count;
@@ -110,6 +112,8 @@ struct autofs_sb_info {
struct mutex wq_mutex;
spinlock_t fs_lock;
struct autofs_wait_queue *queues; /* Wait queue pointer */
+   spinlock_t rehash_lock;
+   struct list_head rehash_list;
 };
 
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
--- linux-2.6.20/fs/autofs4/root.c.lookup-expire-race   2007-02-05 
03:44:54.0 +0900
+++ linux-2.6.20/fs/autofs4/root.c  2007-02-12 12:14:51.0 +0900
@@ -263,7 +263,7 @@ static int try_to_fill_dentry(struct den
 */
status = d_invalidate(dentry);
if (status != -EBUSY)
-   return -ENOENT;
+   return -EAGAIN;
}
 
DPRINTK("dentry=%p %.*s ino=%p",
@@ -413,7 +413,16 @@ static int autofs4_revalidate(struct den
 */
status = try_to_fill_dentry(dentry, flags);
if (status == 0)
-   return 1;
+   return 1;
+
+   /*
+* A status of EAGAIN here means that the dentry has gone
+* away while waiting for an expire to complete. If we are
+* racing with expire lookup will wait for it so this must
+* be a revalidate and we need to send it to lookup.
+*/
+   if (status == -EAGAIN)
+   return 0;
 
return status;
}
@@ -459,9 +468,18 @@ void autofs4_dentry_release(struct dentr
de->d_fsdata = NULL;
 
if (inf) {
+   struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
+
inf->dentry = NULL;
inf->inode = NULL;
 
+   if (sbi) {
+   spin_lock(&sbi->rehash_lock);
+   if (!list_empty(&inf->rehash))
+   list_del(&inf->rehash);
+   spin_unlock(&sbi->rehash_lock);
+   }
+
autofs4_free_ino(inf);
}
 }
@@ -478,10 +496,80 @@ static struct dentry_operations autofs4_
.d_release  = autofs4_dentry_release,
 };
 
+static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, 
struct dentry *parent, struct qstr *name)
+{
+   unsigned int len = name->len;
+   unsigned int hash = name->hash;
+   const unsigned char *str = name->name;
+   struct list_head *p, *head;
+
+   spin_lock(&dcache_lock);
+   spin_lock(&sbi->rehash_lock);
+   head = &sbi->rehash_list;
+   list_for_each(p, head) {
+   struct autofs_info *ino;
+   struct dentry *dentry;
+   struct qstr *qstr;
+
+   ino = list_entry(p, struct autofs_info, rehash);
+   dentry = ino->dentry;
+
+   spin_lock(&dentry->d_lock);
+
+   /* Bad luck, we've already been dentry_iput */
+   if (!dentry->d_inode)
+   goto next;
+
+   qstr = &dentry->d_name;
+
+   if (dentry->d_name.hash != hash)
+   goto next;
+   if (dentry->d_parent != parent)
+   goto next;
+
+   if (qstr->len != len)
+   goto next;
+   if (memcmp(qstr->name, str, len))
+   goto next;
+
+   if (d_unhashed(dentry)) {
+   struct autofs_info *ino = autofs4_dentry_ino(dentry);
+   struct inode *inode = dentry->d_inode;
+
+   list_del_init(&ino->rehash);
+   dget(dentry);
+   /*
+* Make the rehashed dentry negative so the VFS
+* behaves as it should.
+*/
+   if (inode) {
+   dentry->d_inode = NULL;
+   list_del_init(&dentry->d_alias);
+ 

Re: forcedeth problems on 2.6.20-rc6-mm3

2007-02-11 Thread Tobias Diedrich
Jeff Garzik wrote:
> Tobias Diedrich wrote:
> >Tobias Diedrich wrote:
> >>Ayaz Abdulla wrote:
> >>>For all those who are having issues, please try out the attached patch.
> >>Will try.
> >
> >Does not apply cleanly against 2.6.20, is this one fixed up right?
> 
> It probably needs to be top of 2.6.20-git-latest or 2.6.20-rc6-mm3.
> 
> IOW, the forcedeth changes in question are not in 2.6.20, and you need 
> to apply the patch on top of the latest batch of forcedeth changes.

Well, it hasn't blown up on me despite being applied to 2.6.20...
The problem I was seeing might even be fixed in 2.6.20 vanilla,
since the last version I saw it in was 2.6.20-rc6 and then I
reverted to 2.6.19 to make sure that one is ok (see
[EMAIL PROTECTED]).

-- 
Tobias  PGP: http://9ac7e0bc.uguu.de
このメールは十割再利用されたビットで作られています。
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] QRCU fastpath optimization

2007-02-11 Thread Jens Axboe
On Sun, Feb 11 2007, Paul E. McKenney wrote:
> This patch optimizes the "quick" RCU update-side fastpath, so that in the
> absence of readers, synchronize_qrcu() does four non-atomic comparisons
> and three memory barriers, eliminating the need to acquire the global
> lock in this case.  Lightly tested.  Algorithm has been validated for
> the 3-reader-2-updater and 2-reader-3-updater cases -- 3-readers-3-updaters
> case still to be done (I expect to get access to a large-memory machine
> in the next few weeks -- need >>20GB).
> 
> Not for inclusion.  Patch is against Oleg's original patch, and likely
> needs to be rediffed against Jen's patchstack.  I will do this rediffing
> later, first want an easy-to-test and easy-to-inpect version.

I'd suggest just merging this optimization into the original QRCU patch.
Once you are happy with the validation, I'll add it to the plug branch
as well.

Version against the plug branch below.

diff --git a/kernel/srcu.c b/kernel/srcu.c
index 53c6989..bfe347a 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -324,28 +324,53 @@ void synchronize_qrcu(struct qrcu_struct *qp)
 {
int idx;
 
+   smp_mb();  /* Force preceding change to happen before fastpath check. */
+
/*
-* The following memory barrier is needed to ensure that
-* any prior data-structure manipulation is seen by other
-* CPUs to happen before picking up the value of
-* qp->completed.
+* Fastpath: If the two counters sum to "1" at a given point in
+* time, there are no readers.  However, it takes two separate
+* loads to sample both counters, which won't occur simultaneously.
+* So we might race with a counter switch, so that we might see
+* ctr[0]==0, then the counter might switch, then we might see
+* ctr[1]==1 (unbeknownst to us because there is a reader still
+* there).  So we do a read memory barrier and recheck.  If the
+* same race happens again, there must have been a second counter
+* switch.  This second counter switch could not have happened
+* until all preceding readers finished, so if the condition
+* is true both times, we may safely proceed.
+*
+* This relies critically on the atomic increment and atomic
+* decrement being seen as executing in order.
 */
-   smp_mb();
+
+   if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
+   smp_rmb();  /* Keep two checks independent. */
+   if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1)
+   goto out;
+   }
+
mutex_lock(&qp->mutex);
 
idx = qp->completed & 0x1;
if (atomic_read(qp->ctr + idx) == 1)
-   goto out;
+   goto out_unlock;
 
atomic_inc(qp->ctr + (idx ^ 0x1));
-   /* Reduce the likelihood that qrcu_read_lock() will loop */
+
+   /*
+* Prevent subsequent decrement from being seen before previous
+* increment -- such an inversion could cause the fastpath
+* above to falsely conclude that there were no readers.  Also,
+* reduce the likelihood that qrcu_read_lock() will loop.
+*/
smp_mb__after_atomic_inc();
qp->completed++;
 
atomic_dec(qp->ctr + idx);
__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
-out:
+out_unlock:
mutex_unlock(&qp->mutex);
+out:
smp_mb();
/*
 * The above smp_mb() is needed in the case that we

-- 
Jens Axboe

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: gigaset build broken on current linux-2.6.git

2007-02-11 Thread Jeff Garzik
On Sun, Feb 11, 2007 at 10:47:27AM -0800, Greg KH wrote:
> It's in my queue and is on track to get in before 2.6.21-rc1 is out.

It breaks the build for everyone, please fast-forward the merging of
this.

Jeff



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.6.16.40

2007-02-11 Thread Bron Gondwana
On Sat, Feb 10, 2007 at 05:41:13PM +0100, Adrian Bunk wrote:
> New drivers since 2.6.16.39:
> - Areca ARC11X0/ARC12X0 SATA-RAID support
> - AMD Athlon64/FX and Opteron temperature sensor

Sorry - I think I just sent a blank reply to this!  Oops.

I was going to say - thanks.  We'll definitely be using
this kernel, since we've found that 2.6.19.2 has a
significantly worse IO profile than either the 2.6.16.40-rc1
kernel or 2.6.18 on otherwise identical machines with pretty
stable loads (so we can compare back to previous weeks).

Now to figure out what's causing the extra load in the
2.6.19 branch!

Bron.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: CPU load

2007-02-11 Thread Con Kolivas
On Monday 12 February 2007 16:54, malc wrote:
> On Mon, 12 Feb 2007, Con Kolivas wrote:
> > On 12/02/07, Vassili Karpov <[EMAIL PROTECTED]> wrote:
>
> [..snip..]
>
> > The kernel looks at what is using cpu _only_ during the timer
> > interrupt. Which means if your HZ is 1000 it looks at what is running
> > at precisely the moment those 1000 timer ticks occur. It is
> > theoretically possible using this measurement system to use >99% cpu
> > and record 0 usage if you time your cpu usage properly. It gets even
> > more inaccurate at lower HZ values for the same reason.
>
> Thank you very much. This somewhat contradicts what i saw (and outlined
> in usnet article), namely the mplayer+/dev/rtc case. Unless ofcourse
> /dev/rtc interrupt is considered to be the same as the interrupt from
> PIT (on X86 that is)
>
> P.S. Perhaps it worth documenting this in the documentation? I caused
>   me, and perhaps quite a few other people, a great deal of pain and
>   frustration.

Lots of confusion comes from this, and often people think their pc suddenly 
uses a lot less cpu when they change from 1000HZ to 100HZ and use this as an 
argument/reason for changing to 100HZ when in fact the massive _reported_ 
difference is simply worse accounting. Of course there is more overhead going 
from 100 to 1000 but it doesn't suddenly make your apps use 10 times more 
cpu.

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.6.16.40

2007-02-11 Thread Bron Gondwana
On Sat, Feb 10, 2007 at 05:41:13PM +0100, Adrian Bunk wrote:
> New drivers since 2.6.16.39:
> - Areca ARC11X0/ARC12X0 SATA-RAID support
> - AMD Athlon64/FX and Opteron temperature sensor
> 
> 
> Location:
> ftp://ftp.kernel.org/pub/linux/kernel/v2.6/
> 
> git tree:
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.16.y.git
> 
> 
> Changes since 2.6.16.39:
> 
> Adrian Bunk (2):
>   Linux 2.6.16.40-rc1
>   Linux 2.6.16.40
> 
> Andrew Morton (1):
>   [SCSI] areca sysfs fix
> 
> Bartlomiej Zolnierkiewicz (1):
>   ia64: add pci_get_legacy_ide_irq()
> 
> Erich Chen (1):
>   [SCSI] arcmsr: initial driver, version 1.20.00.13
> 
> James Bottomley (1):
>   [SCSI] arcmsr: fix up sysfs values
> 
> Jeff Garzik (1):
>   [libata] use kmap_atomic(KM_IRQ0) in SCSI simulator
> 
> Neil Brown (1):
>   Make 'repair' actually work for raid1.
> 
> Rudolf Marek (4):
>   hwmon: New driver k8temp
>   k8temp: Add documentation
>   k8temp: Documentation update
>   hwmon: Update Rudolf Marek's e-mail address
> 
> Vladimir Saveliev (1):
>   reiserfs: avoid tail packing if an inode was ever mmapped
> 
> 
>  Documentation/hwmon/k8temp  |   55 +
>  Documentation/scsi/ChangeLog.arcmsr |   56 +
>  Documentation/scsi/arcmsr_spec.txt  |  574 ++
>  MAINTAINERS |6 
>  Makefile|2 
>  drivers/hwmon/Kconfig   |   12 
>  drivers/hwmon/Makefile  |1 
>  drivers/hwmon/hwmon-vid.c   |4 
>  drivers/hwmon/k8temp.c  |  292 +
>  drivers/hwmon/w83792d.c |2 
>  drivers/i2c/busses/i2c-ali1563.c|2 
>  drivers/md/md.c |2 
>  drivers/md/raid1.c  |5 
>  drivers/scsi/Kconfig|   14 
>  drivers/scsi/Makefile   |1 
>  drivers/scsi/arcmsr/Makefile|6 
>  drivers/scsi/arcmsr/arcmsr.h|  472 
>  drivers/scsi/arcmsr/arcmsr_attr.c   |  381 +++
>  drivers/scsi/arcmsr/arcmsr_hba.c| 1496 
>  drivers/scsi/libata-scsi.c  |4 
>  fs/reiserfs/file.c  |   20 
>  fs/reiserfs/inode.c |2 
>  include/asm-ia64/pci.h  |6 
>  include/linux/pci_ids.h |   18 
>  include/linux/reiserfs_fs_i.h   |2 
>  25 files changed, 3426 insertions(+), 9 deletions(-)
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: CPU load

2007-02-11 Thread Con Kolivas
On Monday 12 February 2007 16:55, Stephen Rothwell wrote:
> On Mon, 12 Feb 2007 16:44:22 +1100 "Con Kolivas" <[EMAIL PROTECTED]> wrote:
> > The kernel looks at what is using cpu _only_ during the timer
> > interrupt. Which means if your HZ is 1000 it looks at what is running
> > at precisely the moment those 1000 timer ticks occur. It is
> > theoretically possible using this measurement system to use >99% cpu
> > and record 0 usage if you time your cpu usage properly. It gets even
> > more inaccurate at lower HZ values for the same reason.
>
> That is not true on all architecures, some do more accurate accounting by
> recording the times at user/kernel/interrupt transitions ...

Indeed. It's certainly the way the common, more boring PC architectures 
do it, though.

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: CPU load

2007-02-11 Thread Stephen Rothwell
On Mon, 12 Feb 2007 16:44:22 +1100 "Con Kolivas" <[EMAIL PROTECTED]> wrote:
>
> The kernel looks at what is using cpu _only_ during the timer
> interrupt. Which means if your HZ is 1000 it looks at what is running
> at precisely the moment those 1000 timer ticks occur. It is
> theoretically possible using this measurement system to use >99% cpu
> and record 0 usage if you time your cpu usage properly. It gets even
> more inaccurate at lower HZ values for the same reason.

That is not true on all architectures; some do more accurate accounting by
recording the times at user/kernel/interrupt transitions ...

--
Cheers,
Stephen Rothwell[EMAIL PROTECTED]
http://www.canb.auug.org.au/~sfr/


pgpMZ5w06pmhZ.pgp
Description: PGP signature


[PATCH 004 of 4] knfsd: Allow the server to provide a gid list when using AUTH_UNIX authentication.

2007-02-11 Thread NeilBrown

AUTH_UNIX authentication (the standard with NFS) has a limit of 16
groups ids.  This causes problems for people in more than 16
groups.

So allow the server to map a uid into a list of group ids based on
local knowledge rather than depending on the (possibly truncated) list
from the client.

If there is no process on the server responding to upcalls,
the gidlist in the request will still be used.
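
For illustration, each downcall into this cache is a single text line in
the format parsed below ("uid expiry Ngid gid0 ... gidN-1").  A
hypothetical reply mapping uid 1000 to three groups could look like this
(values made up; the expiry field is an absolute expiry time as used by
the sunrpc cache code):

	1000 1234567890 3 100 27 1001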

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./net/sunrpc/sunrpc_syms.c  |5 
 ./net/sunrpc/svcauth_unix.c |  225 +++-
 2 files changed, 224 insertions(+), 6 deletions(-)

diff .prev/net/sunrpc/sunrpc_syms.c ./net/sunrpc/sunrpc_syms.c
--- .prev/net/sunrpc/sunrpc_syms.c  2007-02-12 16:34:14.0 +1100
+++ ./net/sunrpc/sunrpc_syms.c  2007-02-12 16:41:02.0 +1100
@@ -136,7 +136,7 @@ EXPORT_SYMBOL(nlm_debug);
 
 extern int register_rpc_pipefs(void);
 extern void unregister_rpc_pipefs(void);
-extern struct cache_detail ip_map_cache;
+extern struct cache_detail ip_map_cache, unix_gid_cache;
 extern int init_socket_xprt(void);
 extern void cleanup_socket_xprt(void);
 
@@ -156,6 +156,7 @@ init_sunrpc(void)
rpc_proc_init();
 #endif
cache_register(&ip_map_cache);
+   cache_register(&unix_gid_cache);
init_socket_xprt();
 out:
return err;
@@ -169,6 +170,8 @@ cleanup_sunrpc(void)
rpc_destroy_mempool();
if (cache_unregister(&ip_map_cache))
printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
+   if (cache_unregister(&unix_gid_cache))
+ printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
 #ifdef RPC_DEBUG
rpc_unregister_sysctl();
 #endif

diff .prev/net/sunrpc/svcauth_unix.c ./net/sunrpc/svcauth_unix.c
--- .prev/net/sunrpc/svcauth_unix.c 2007-02-12 16:34:14.0 +1100
+++ ./net/sunrpc/svcauth_unix.c 2007-02-12 16:41:02.0 +1100
@@ -418,6 +418,214 @@ svcauth_unix_info_release(void *info)
cache_put(&ipm->h, &ip_map_cache);
 }
 
+/*
+ * auth.unix.gid cache
+ * simple cache to map a UID to a list of GIDs
+ * because AUTH_UNIX aka AUTH_SYS has a max of 16
+ */
+#define GID_HASHBITS 8
+#define GID_HASHMAX (1<<GID_HASHBITS)
+#define GID_HASH(uid) ((uid)&(GID_HASHMAX-1))
+
+struct unix_gid {
+   struct cache_head h;
+   uid_t uid;
+   struct group_info *gi;
+};
+static struct cache_head *gid_table[GID_HASHMAX];
+
+static void unix_gid_put(struct kref *kref)
+{
+   struct cache_head *item = container_of(kref, struct cache_head, ref);
+   struct unix_gid *ug = container_of(item, struct unix_gid, h);
+   if (test_bit(CACHE_VALID, &item->flags) &&
+   !test_bit(CACHE_NEGATIVE, &item->flags))
+   put_group_info(ug->gi);
+   kfree(ug);
+}
+
+static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
+{
+   struct unix_gid *orig = container_of(corig, struct unix_gid, h);
+   struct unix_gid *new = container_of(cnew, struct unix_gid, h);
+   return orig->uid == new->uid;
+}
+static void unix_gid_init(struct cache_head *cnew, struct cache_head *citem)
+{
+   struct unix_gid *new = container_of(cnew, struct unix_gid, h);
+   struct unix_gid *item = container_of(citem, struct unix_gid, h);
+   new->uid = item->uid;
+}
+static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem)
+{
+   struct unix_gid *new = container_of(cnew, struct unix_gid, h);
+   struct unix_gid *item = container_of(citem, struct unix_gid, h);
+
+   get_group_info(item->gi);
+   new->gi = item->gi;
+}
+static struct cache_head *unix_gid_alloc(void)
+{
+   struct unix_gid *g = kmalloc(sizeof(*g), GFP_KERNEL);
+   if (g)
+   return &g->h;
+   else
+   return NULL;
+}
+
+static void unix_gid_request(struct cache_detail *cd,
+struct cache_head *h,
+char **bpp, int *blen)
+{
+   char tuid[20];
+   struct unix_gid *ug = container_of(h, struct unix_gid, h);
+
+   snprintf(tuid, 20, "%u", ug->uid);
+   qword_add(bpp, blen, tuid);
+   (*bpp)[-1] = '\n';
+}
+
+static struct unix_gid *unix_gid_lookup(uid_t uid);
+extern struct cache_detail unix_gid_cache;
+
+static int unix_gid_parse(struct cache_detail *cd,
+   char *mesg, int mlen)
+{
+   /* uid expiry Ngid gid0 gid1 ... gidN-1 */
+   int uid;
+   int gids;
+   int rv;
+   int i;
+   int err;
+   time_t expiry;
+   struct unix_gid ug, *ugp;
+
+   if (mlen <= 0 || mesg[mlen-1] != '\n')
+   return -EINVAL;
+   mesg[mlen-1] = 0;
+
+   rv = get_int(&mesg, &uid);
+   if (rv)
+   return -EINVAL;
+   ug.uid = uid;
+
+   expiry = get_expiry(&mesg);
+   if (expiry == 0)
+   return -EINVAL;
+
+   rv = get_int(&mesg, &gids);
+   if (rv || gids < 0 || gids > 8192)
+   return -EINVAL;
+
+   ug.gi = groups_alloc(gids);
+   if (!ug.gi)
+   return -ENOMEM;
+
+   for (i = 0 ; i < gids ; i++) {
+   int gid;
+   rv = get_int(&mesg, &gid);
+   err = -EINVAL;
+   if (rv)
+   goto out;
+   GROUP_AT(ug.gi, i) = gid;
+   }
+
+   ugp = unix_gid_lookup(uid);
+   if (ugp) {
+ 

[PATCH 000 of 4] knfsd: fixes and enhancements for 2.6.21

2007-02-11 Thread NeilBrown
Following are 4 patchs from knfsd suitable for 2.6.21.

Numbers 3 and 4 provide new usability features that require a new
nfs-utils to make full use of (all nfs-utils versions will of course
continue to work, providing the functionality they always provided).

(3) allows a 16 byte uuid to be used to identify the filesystem rather
than the device id (which is volatile) or a 32bit number (which has to
be managed manually).  nfs-utils used libblkid to extract a uuid from
most filesystems. (Thanks Trond for reviewing this code - I hope you
like the changes).

(4) allows an upcall to be made to map a 'uid' to a list of 'gids'.
The AUTH_UNIX authentication protocol only carries 16 group ids which
causes problems for people with more than 16 group ids.

 [PATCH 001 of 4] knfsd: Fix return value for writes to some files in 'nfsd' 
filesystem.
 [PATCH 002 of 4] knfsd: Tidy up choice of filesystem-identifier when creating 
a filehandle.
 [PATCH 003 of 4] knfsd: Add some new fsid types.
 [PATCH 004 of 4] knfsd: Allow the server to provide a gid list when using 
AUTH_UNIX authentication.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 002 of 4] knfsd: Tidy up choice of filesystem-identifier when creating a filehandle.

2007-02-11 Thread NeilBrown

If we are using the same version/fsid as a current filehandle, then
there is no need to verify that the numbers are valid for this
export - they must be, since we used them to find this export.

This allows us to simplify the fsid selection code.

Also change "ref_fh_version" and "ref_fh_fsid_type" to "version" and
"fsid_type", as the important thing isn't that they are the
version/type of the reference filehandle, but they are the chosen type
for the new filehandle.

And tidy up some indenting.

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./fs/nfsd/nfsfh.c |  124 ++
 1 file changed, 60 insertions(+), 64 deletions(-)

diff .prev/fs/nfsd/nfsfh.c ./fs/nfsd/nfsfh.c
--- .prev/fs/nfsd/nfsfh.c   2007-02-12 16:39:19.0 +1100
+++ ./fs/nfsd/nfsfh.c   2007-02-12 16:39:36.0 +1100
@@ -211,7 +211,7 @@ fh_verify(struct svc_rqst *rqstp, struct
fileid_type = 2;
} else
fileid_type = fh->fh_fileid_type;
-   
+
if (fileid_type == 0)
dentry = dget(exp->ex_dentry);
else {
@@ -291,7 +291,7 @@ static inline int _fh_update(struct dent
 __u32 *datap, int *maxsize)
 {
struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op;
-   
+
if (dentry == exp->ex_dentry) {
*maxsize = 0;
return 0;
@@ -316,7 +316,8 @@ static inline void _fh_update_old(struct
 }
 
 __be32
-fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, 
struct svc_fh *ref_fh)
+fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+  struct svc_fh *ref_fh)
 {
/* ref_fh is a reference file handle.
 * if it is non-null and for the same filesystem, then we should compose
@@ -326,8 +327,8 @@ fh_compose(struct svc_fh *fhp, struct sv
 *
 */
 
-   u8 ref_fh_version = 0;
-   u8 ref_fh_fsid_type = 0;
+   u8 version = 1;
+   u8 fsid_type = 0;
struct inode * inode = dentry->d_inode;
struct dentry *parent = dentry->d_parent;
__u32 *datap;
@@ -339,57 +340,52 @@ fh_compose(struct svc_fh *fhp, struct sv
parent->d_name.name, dentry->d_name.name,
(inode ? inode->i_ino : 0));
 
+   /* Choose filehandle version and fsid type based on
+* the reference filehandle (if it is in the same export)
+* or the export options.
+*/
if (ref_fh && ref_fh->fh_export == exp) {
-   ref_fh_version = ref_fh->fh_handle.fh_version;
-   if (ref_fh_version == 0xca)
-   ref_fh_fsid_type = 0;
+   version = ref_fh->fh_handle.fh_version;
+   if (version == 0xca)
+   fsid_type = 0;
else
-   ref_fh_fsid_type = ref_fh->fh_handle.fh_fsid_type;
-   if (ref_fh_fsid_type > 3)
-   ref_fh_fsid_type = 0;
-
-   /* make sure ref_fh type works for given export */
-   if (ref_fh_fsid_type == 1 &&
-   !(exp->ex_flags & NFSEXP_FSID)) {
-   /* if we don't have an fsid, we cannot provide one... */
-   ref_fh_fsid_type = 0;
-   }
+   fsid_type = ref_fh->fh_handle.fh_fsid_type;
+   /* We know this version/type works for this export
+* so there is no need for further checks.
+*/
} else if (exp->ex_flags & NFSEXP_FSID)
-   ref_fh_fsid_type = 1;
-
-   if (!old_valid_dev(ex_dev) && ref_fh_fsid_type == 0) {
+   fsid_type = 1;
+   else if (!old_valid_dev(ex_dev))
/* for newer device numbers, we must use a newer fsid format */
-   ref_fh_version = 1;
-   ref_fh_fsid_type = 3;
-   }
-   if (old_valid_dev(ex_dev) &&
-   (ref_fh_fsid_type == 2 || ref_fh_fsid_type == 3))
-   /* must use type1 for smaller device numbers */
-   ref_fh_fsid_type = 0;
+   fsid_type = 3;
+   else
+   fsid_type = 0;
 
if (ref_fh == fhp)
fh_put(ref_fh);
 
if (fhp->fh_locked || fhp->fh_dentry) {
printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n",
-   parent->d_name.name, dentry->d_name.name);
+  parent->d_name.name, dentry->d_name.name);
}
if (fhp->fh_maxsize < NFS_FHSIZE)
printk(KERN_ERR "fh_compose: called with maxsize %d! %s/%s\n",
-  fhp->fh_maxsize, parent->d_name.name, 
dentry->d_name.name);
+  fhp->fh_maxsize,
+  parent->d_name.name, dentry->d_name.name);
 
fhp->fh_dentry = dget(dentry); /* our internal copy */

[PATCH 003 of 4] knfsd: Add some new fsid types.

2007-02-11 Thread NeilBrown

Add support for using a filesystem UUID to identify an
export point in the filehandle.
For NFSv2, this UUID is xor-ed down to 4 or 8 bytes so
that it doesn't take up too much room.  For NFSv3+, we
use the full 16 bytes, and possibly also a 64bit inode number
for exports beneath the root of a filesystem.

When generating an fsid to return in 'stat' information,
use the UUID (hashed down to size) if it is available and 
a small 'fsid' was not specifically provided.
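
For illustration, xor-ing the 16-byte UUID down to the smaller NFSv2
sizes can be pictured roughly as follows (sketch only; the helper name
is made up and this is not the patch's actual code):

	/* Fold a 16-byte uuid down to outlen (4 or 8) bytes by xor. */
	static void fold_uuid(const unsigned char uuid[16],
			      unsigned char *out, int outlen)
	{
		int i;

		memset(out, 0, outlen);
		for (i = 0; i < 16; i++)
			out[i % outlen] ^= uuid[i];
	}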

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./fs/nfsd/export.c|  106 +++--
 ./fs/nfsd/nfs3xdr.c   |   31 +---
 ./fs/nfsd/nfs4xdr.c   |   10 +++
 ./fs/nfsd/nfsfh.c |   88 ++
 ./fs/nfsd/nfsxdr.c|   19 ++-
 ./include/linux/nfsd/export.h |7 +-
 ./include/linux/nfsd/nfsd.h   |   12 
 ./include/linux/nfsd/nfsfh.h  |  107 +++---
 8 files changed, 246 insertions(+), 134 deletions(-)

diff .prev/fs/nfsd/export.c ./fs/nfsd/export.c
--- .prev/fs/nfsd/export.c  2007-02-12 16:34:14.0 +1100
+++ ./fs/nfsd/export.c  2007-02-12 16:39:45.0 +1100
@@ -189,18 +189,17 @@ static int expkey_show(struct seq_file *
   struct cache_head *h)
 {
struct svc_expkey *ek ;
+   int i;
 
if (h ==NULL) {
seq_puts(m, "#domain fsidtype fsid [path]\n");
return 0;
}
ek = container_of(h, struct svc_expkey, h);
-   seq_printf(m, "%s %d 0x%08x", ek->ek_client->name,
-  ek->ek_fsidtype, ek->ek_fsid[0]);
-   if (ek->ek_fsidtype != 1)
-   seq_printf(m, "%08x", ek->ek_fsid[1]);
-   if (ek->ek_fsidtype == 2)
-   seq_printf(m, "%08x", ek->ek_fsid[2]);
+   seq_printf(m, "%s %d 0x", ek->ek_client->name,
+  ek->ek_fsidtype);
+   for (i=0; i < key_len(ek->ek_fsidtype)/4; i++)
+   seq_printf(m, "%08x", ek->ek_fsid[i]);
if (test_bit(CACHE_VALID, &h->flags) && 
!test_bit(CACHE_NEGATIVE, &h->flags)) {
seq_printf(m, " ");
@@ -231,9 +230,8 @@ static inline void expkey_init(struct ca
kref_get(>ek_client->ref);
new->ek_client = item->ek_client;
new->ek_fsidtype = item->ek_fsidtype;
-   new->ek_fsid[0] = item->ek_fsid[0];
-   new->ek_fsid[1] = item->ek_fsid[1];
-   new->ek_fsid[2] = item->ek_fsid[2];
+
+   memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid));
 }
 
 static inline void expkey_update(struct cache_head *cnew,
@@ -362,7 +360,7 @@ static struct svc_export *svc_export_upd
struct svc_export *old);
 static struct svc_export *svc_export_lookup(struct svc_export *);
 
-static int check_export(struct inode *inode, int flags)
+static int check_export(struct inode *inode, int flags, unsigned char *uuid)
 {
 
/* We currently export only dirs and regular files.
@@ -375,12 +373,13 @@ static int check_export(struct inode *in
/* There are two requirements on a filesystem to be exportable.
 * 1:  We must be able to identify the filesystem from a number.
 *   either a device number (so FS_REQUIRES_DEV needed)
-*   or an FSID number (so NFSEXP_FSID needed).
+*   or an FSID number (so NFSEXP_FSID or ->uuid is needed).
 * 2:  We must be able to find an inode from a filehandle.
 *   This means that s_export_op must be set.
 */
if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
-   !(flags & NFSEXP_FSID)) {
+   !(flags & NFSEXP_FSID) &&
+   uuid == NULL) {
dprintk("exp_export: export of non-dev fs without fsid\n");
return -EINVAL;
}
@@ -405,10 +404,6 @@ fsloc_parse(char **mesg, char *buf, stru
int len;
int migrated, i, err;
 
-   len = qword_get(mesg, buf, PAGE_SIZE);
-   if (len != 5 || memcmp(buf, "fsloc", 5))
-   return 0;
-
/* listsize */
err = get_int(mesg, >locations_count);
if (err)
@@ -519,6 +514,8 @@ static int svc_export_parse(struct cache
exp.ex_fslocs.locations_count = 0;
exp.ex_fslocs.migrated = 0;
 
+   exp.ex_uuid = NULL;
+
/* flags */
+   err = get_int(&mesg, &an_int);
if (err == -ENOENT)
@@ -542,12 +539,33 @@ static int svc_export_parse(struct cache
if (err) goto out;
exp.ex_fsid = an_int;
 
-   err = check_export(nd.dentry->d_inode, exp.ex_flags);
-   if (err) goto out;
+   while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
+   if (strcmp(buf, "fsloc") == 0)
+   err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+   else if (strcmp(buf, "uuid") == 0) {
+   /* expect a 16 byte uuid encoded as 

[PATCH 001 of 4] knfsd: Fix return value for writes to some files in 'nfsd' filesystem.

2007-02-11 Thread NeilBrown

Most files in the 'nfsd' filesystem are transactional.
When you write, a reply is generated that can be read back
only on the same 'file'.
If the reply has zero length, the 'write' will incorrectly 
return a value of '0' instead of the length that was
written.  This causes 'rpc.nfsd' to give an annoying warning.

This patch fixes the test.

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./fs/nfsd/nfsctl.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff .prev/fs/nfsd/nfsctl.c ./fs/nfsd/nfsctl.c
--- .prev/fs/nfsd/nfsctl.c  2007-02-12 16:39:20.0 +1100
+++ ./fs/nfsd/nfsctl.c  2007-02-12 16:39:33.0 +1100
@@ -123,7 +123,7 @@ static ssize_t nfsctl_transaction_write(
return PTR_ERR(data);
 
rv =  write_op[ino](file, data, size);
-   if (rv>0) {
+   if (rv >= 0) {
simple_transaction_set(file, rv);
rv = size;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: libata FUA revisited

2007-02-11 Thread Robert Hancock

Robert Hancock wrote:

Given the above, what I'm proposing to do is:

-Remove the blacklisting of Maxtor BANC1G10 firmware for FUA. If we need 
to FUA-blacklist any drives this should likely be added to the existing 
"horkage" mechanism we now have. However, at this point I don't think 
that's needed, considering that I've seen no conclusive evidence that 
any drive has ever been established to have broken FUA.


-Add a new port flag ATA_FLAG_NO_FUA to indicate that a controller can't 
handle FUA commands, and add that flag to sata_sil. Force FUA off on any 
drive connected to a controller with this bit set.


There was some talk that sata_mv might have this problem, but I believe 
the conclusion was that it didn't. The only controllers that would are 
ones that actually try to interpret the ATA command codes and don't know 
about WRITE DMA FUA.


-Change the fua module option to control FUA enable/disable to have a 
third value, "enable for NCQ-supporting drives only", which would become 
the new default. That case seems less likely to cause problems since FUA 
on NCQ is just another bit in the command whereas FUA on non-NCQ is an 
entirely different, potentially unsupported command.


OK, here's what I've got to implement the above, and a few other things -
not submitted for inclusion yet as I'd like to get a few comments.

This centralizes the logic in one place for deciding whether to use FUA
or not. It also modifies the logic to account for the fact that when
NCQ is enabled we should always be able to use FUA, since it's inherent
in the definition of the NCQ commands. Since enabling and disabling NCQ
can thus also enable/disable FUA (if the drive doesn't support non-NCQ
FUA) we need to revalidate the device when doing this on change_queue_depth
so that the SCSI layer sees the change.

(I tried to test this, but wasn't able to actually change the queue depth
using the /sys/block/sda/device/queue_depth file. The queue_depth attribute
started out as r--r--r--, I tried chmod u+w and writing it but just got an
"Input/output error". Did somebody break or disable this functionality?)

Also, as well as setting ATA_FLAG_NO_FUA in sata_sil it appears that
pata_it821x also needs FUA disabled when in smart mode as the firmware can't
handle that command.
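
Roughly, the intended meaning of the three-valued option is the
following (illustrative sketch only, with a made-up helper name; the
real checks are in ata_dev_supports_fua() in the patch below):

	/* Sketch of the proposed fua=0/1/2 semantics. */
	static int fua_allowed(struct ata_device *dev)
	{
		if (libata_fua == 0)
			return 0;		/* FUA globally disabled */
		if (libata_fua == 1)		/* default: NCQ drives only */
			return (dev->flags & ATA_DFLAG_NCQ) != 0;
		return ata_id_has_fua(dev->id);	/* 2: any drive claiming FUA */
	}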

diff -rup linux-2.6.20-git6/drivers/ata/libata-core.c 
linux-2.6.20-git6edit/drivers/ata/libata-core.c
--- linux-2.6.20-git6/drivers/ata/libata-core.c 2007-02-11 17:31:19.0 
-0600
+++ linux-2.6.20-git6edit/drivers/ata/libata-core.c 2007-02-11 
21:43:11.0 -0600
@@ -85,9 +85,9 @@ int atapi_dmadir = 0;
module_param(atapi_dmadir, int, 0444);
MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off, 
1=on)");

-int libata_fua = 0;
+int libata_fua = 1;
module_param_named(fua, libata_fua, int, 0444);
-MODULE_PARM_DESC(fua, "FUA support (0=off, 1=on)");
+MODULE_PARM_DESC(fua, "FUA support (0=off, 1=on for NCQ drives only, 2=on)");

static int ata_probe_timeout = ATA_TMOUT_INTERNAL / HZ;
module_param(ata_probe_timeout, int, 0444);
diff -rup linux-2.6.20-git6/drivers/ata/libata-scsi.c 
linux-2.6.20-git6edit/drivers/ata/libata-scsi.c
--- linux-2.6.20-git6/drivers/ata/libata-scsi.c 2007-02-11 17:31:19.0 
-0600
+++ linux-2.6.20-git6edit/drivers/ata/libata-scsi.c 2007-02-11 
23:07:35.0 -0600
@@ -1002,6 +1002,16 @@ int ata_scsi_change_queue_depth(struct s

scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, queue_depth);

+   /* Note: NCQ is switched off if queue depth is set to 1.
+  Thus changing the depth may also enable/disable FUA,
+  which the SCSI layer needs to know about, so we trigger
+  a revalidate. */
+   if((queue_depth == 1 && !(dev->flags & ATA_DFLAG_NCQ_OFF)) ||
+  (queue_depth > 1 && (dev->flags & ATA_DFLAG_NCQ_OFF))) {
+   ap->eh_info.action |= ATA_EH_REVALIDATE;
+   ata_port_schedule_eh(ap);
+   }
+
spin_lock_irqsave(ap->lock, flags);
if (queue_depth > 1)
dev->flags &= ~ATA_DFLAG_NCQ_OFF;
@@ -1990,27 +2000,46 @@ static unsigned int ata_msense_rw_recove
}

/*
- * We can turn this into a real blacklist if it's needed, for now just
- * blacklist any Maxtor BANC1G10 revision firmware
+ * ata_dev_supports_fua - Determine if this device supports FUA.
+ * @dev: Device to check
+ *
+ * Determine if this device supports FUA based on drive and
+ * controller capabilities.
+ *
+ * LOCKING:
+ * None.
 */
-static int ata_dev_supports_fua(u16 *id)
+static int ata_dev_supports_fua(struct ata_device* dev)
{
-   unsigned char model[ATA_ID_PROD_LEN + 1], fw[ATA_ID_FW_REV_LEN + 1];
-
+   /* Is FUA completely disabled? */
if (!libata_fua)
return 0;
-   if (!ata_id_has_fua(id))
+   
+   /* Does the drive support FUA?
+  NCQ-enabled drives always support FUA, otherwise
+  check if the drive indicates support for FUA commands. */
+  

Re: CPU load

2007-02-11 Thread Con Kolivas

On 12/02/07, Vassili Karpov <[EMAIL PROTECTED]> wrote:

Hello,

How does the kernel calculates the value it places in `/proc/stat' at
4th position (i.e. "idle: twiddling thumbs")?

For background information as to why this question arose in the first
place read on.

While writing the code dealing with video acquisition/processing at
work noticed that what top(1) (and every other tool that uses
`/proc/stat' or `/proc/uptime') shows some very strange results.

Top claimed that the system running one version of the code[A] is
idling more often than the code[B] doing the same thing but more
cleverly. After some head scratching one of my colleagues suggested a
simple test that was implemented in a few minutes.

The test consisted of a counter that incremented in an endless loop
also after certain period of time had elapsed it printed the value of
the counter.  Running this test (with priority set to the lowest
possible level) with code[A] and code[B] confirmed that code[B] is
indeed faster than code[A], in a sense that the test made more forward
progress while code[B] is running.

Hard-coding some things (i.e. the value of the counter after counting
for the duration of one period on completely idle system) we extended
the test to show the percentage of CPU that was utilized. This never
matched the value that top presented us with.

Later small kernel module was developed that tried to time how much
time is spent in the idle handler inside the kernel and exported this
information to the user-space. The results were consistent with our
expectations and the output of the test utility.

Two more points.

a. In the past (again video processing context) i have witnessed
   `/proc/stat' claiming that CPU utilization is 0% for, say, 20
   seconds followed by 5 seconds of 30% load, and then the cycle
   repeated. According to the methods outlined above the load is
   always at 30%.

b. In my personal experience difference between `/proc/stat' and
   "reality" can easily reach 40% (think i saw even more than that)

The module and graphical application that uses it, along with some
short README and a link to Usenet article dealing with the same
subject is available at:
http://www.boblycat.org/~malc/apc


The kernel looks at what is using cpu _only_ during the timer
interrupt. Which means if your HZ is 1000 it looks at what is running
at precisely the moment those 1000 timer ticks occur. It is
theoretically possible using this measurement system to use >99% cpu
and record 0 usage if you time your cpu usage properly. It gets even
more inaccurate at lower HZ values for the same reason.
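
In pseudo-C, the sampling scheme amounts to something like this
(illustrative sketch only, names made up; not the kernel's actual
accounting code):

	struct cpu_stat { unsigned long user, system, idle; };

	/* On every timer tick the whole tick is charged to whatever happens
	 * to be running at that instant; work done entirely between two
	 * ticks is never seen. */
	void account_one_tick(struct cpu_stat *stat, int running_idle,
			      int in_user_mode)
	{
		if (running_idle)
			stat->idle++;
		else if (in_user_mode)
			stat->user++;
		else
			stat->system++;
	}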

--
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


CPU load

2007-02-11 Thread Vassili Karpov
Hello,

How does the kernel calculates the value it places in `/proc/stat' at
4th position (i.e. "idle: twiddling thumbs")?

For background information as to why this question arose in the first
place read on.

While writing code dealing with video acquisition/processing at
work, I noticed that top(1) (and every other tool that uses
`/proc/stat' or `/proc/uptime') shows some very strange results.

Top claimed that the system running one version of the code[A] is
idling more often than the code[B] doing the same thing but more
cleverly. After some head scratching one of my colleagues suggested a
simple test that was implemented in a few minutes.

The test consisted of a counter that was incremented in an endless loop;
after a certain period of time had elapsed, it printed the value of
the counter.  Running this test (with priority set to the lowest
possible level) with code[A] and code[B] confirmed that code[B] is
indeed faster than code[A], in a sense that the test made more forward
progress while code[B] is running.
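
The test was essentially the following (reconstructed sketch, not the
actual test code; run at the lowest priority as described):

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		unsigned long long counter = 0;
		time_t start = time(NULL);

		for (;;) {
			counter++;
			if (time(NULL) - start >= 5) {	/* one period */
				printf("%llu\n", counter);
				fflush(stdout);
				counter = 0;
				start = time(NULL);
			}
		}
	}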

Hard-coding some things (i.e. the value of the counter after counting
for the duration of one period on a completely idle system) we extended
the test to show the percentage of CPU that was utilized. This never
matched the value that top presented us with.

Later, a small kernel module was developed that tried to measure how much
time is spent in the idle handler inside the kernel, and exported this
information to user-space. The results were consistent with our
expectations and the output of the test utility.

Two more points.

a. In the past (again video processing context) i have witnessed
   `/proc/stat' claiming that CPU utilization is 0% for, say, 20
   seconds followed by 5 seconds of 30% load, and then the cycle
   repeated. According to the methods outlined above the load is
   always at 30%.

b. In my personal experience the difference between `/proc/stat' and
   "reality" can easily reach 40% (I think I saw even more than that)

The module and graphical application that uses it, along with some
short README and a link to Usenet article dealing with the same
subject is available at:
http://www.boblycat.org/~malc/apc

Thanks


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/8] lguest: trivial guest block driver

2007-02-11 Thread Jens Axboe
On Mon, Feb 12 2007, Jens Axboe wrote:
> On Mon, Feb 12 2007, Rusty Russell wrote:
> > On Mon, 2007-02-12 at 05:43 +0100, Jens Axboe wrote:
> > > On Mon, Feb 12 2007, Rusty Russell wrote:
> > > > +   end_request(bd->req, bd->lb_page->result == 1);
> > >
> > > You are using the old-style end request handling. So while I generally
> > > discourage use of end_request(), you seem to have a bigger problem here:
> > 
> > > > +   rq_for_each_bio(bio, req) {
> > > > +   struct bio_vec *bvec;
> > > > +   bio_for_each_segment(bvec, bio, idx) {
> > > > +   BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
> > > > +   BUG_ON(!bvec->bv_len);
> > > > +   dma->addr[i] = page_to_phys(bvec->bv_page)
> > > > +   + bvec->bv_offset;
> > > > +   dma->len[i] = bvec->bv_len;
> > > > +   len += bvec->bv_len;
> > > > +   i++;
> > > > +   }
> > > > +   }
> > > > +   if (i < LGUEST_MAX_DMA_SECTIONS)
> > > > +   dma->len[i] = 0;
> > > > +   return len;
> > > > +}
> > > 
> > > Here you map the entire request (lets call that segment A..Z), but
> > > end_request() only completes the first chunk of the request. So
> > > elv_next_request() will retrieve the same request again, and you'll then
> > > map B..Z and repeat that transfer. So unless I'm missing some other part
> > > here (just read it over quickly), you are re-doing large parts of a
> > > merged request several times.
> > > 
> > > So: don't use end_request(). Add some driver helper that does:
> > > 
> > > static void lgb_end_request(struct blockdev *bd)
> > > {
> > > int uptodate = bd->lb_page->result == 1;
> > > struct request *rq = bd->req;
> > > 
> > > end_that_request_first(rq, uptodate, req->hard_nr_sectors);
> > > add_disk_randomness(rq->rq_disk);
> > > blkdev_dequeue_request(rq);
> > > end_that_request_last(rq, uptodate);
> > > }
> > > 
> > > We could probably even make that a block layer helper, I'm sure others
> > > could be cleaned up with that as well. You want to use that helper in
> > > do_lgb_request() as well.
> > 
> > I'm confused.  That code looks like end_request:
> > 
> > void end_request(struct request *req, int uptodate)
> > {
> > if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
> > add_disk_randomness(req->rq_disk);
> > blkdev_dequeue_request(req);
> > end_that_request_last(req, uptodate);
> > }
> > }
> 
> Note hard_cur_sectors vs hard_nr_sectors. The former refers to the first
> segment sector count, the latter to the total sector count in the
> request. Hence the difference!

Also, that is why my example code doesn't check the
end_that_request_first() return value, it must be 0 or it would be
buggy. Alternatively,

if (end_that_request_first(rq, uptodate, rq->hard_nr_sectors))
BUG()

would make that more clear. Or just a nice little comment. Anyway, if
you fixup lguest I'll add a kernel helper that we can switch it to once
merged.

-- 
Jens Axboe

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/8] lguest: trivial guest block driver

2007-02-11 Thread Jens Axboe
On Mon, Feb 12 2007, Rusty Russell wrote:
> On Mon, 2007-02-12 at 05:43 +0100, Jens Axboe wrote:
> > On Mon, Feb 12 2007, Rusty Russell wrote:
> > > + end_request(bd->req, bd->lb_page->result == 1);
> >
> > You are using the old-style end request handling. So while I generally
> > discourage use of end_request(), you seem to have a bigger problem here:
> 
> > > + rq_for_each_bio(bio, req) {
> > > + struct bio_vec *bvec;
> > > + bio_for_each_segment(bvec, bio, idx) {
> > > + BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
> > > + BUG_ON(!bvec->bv_len);
> > > + dma->addr[i] = page_to_phys(bvec->bv_page)
> > > + + bvec->bv_offset;
> > > + dma->len[i] = bvec->bv_len;
> > > + len += bvec->bv_len;
> > > + i++;
> > > + }
> > > + }
> > > + if (i < LGUEST_MAX_DMA_SECTIONS)
> > > + dma->len[i] = 0;
> > > + return len;
> > > +}
> > 
> > Here you map the entire request (let's call that segment A..Z), but
> > end_request() only completes the first chunk of the request. So
> > elv_next_request() will retrieve the same request again, and you'll then
> > map B..Z and repeat that transfer. So unless I'm missing some other part
> > here (just read it over quickly), you are re-doing large parts of a
> > merged request several times.
> > 
> > So: don't use end_request(). Add some driver helper that does:
> > 
> > static void lgb_end_request(struct blockdev *bd)
> > {
> > int uptodate = bd->lb_page->result == 1;
> > struct request *rq = bd->req;
> > 
> > end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
> > add_disk_randomness(rq->rq_disk);
> > blkdev_dequeue_request(rq);
> > end_that_request_last(rq, uptodate);
> > }
> > 
> > We could probably even make that a block layer helper, I'm sure others
> > could be cleaned up with that as well. You want to use that helper in
> > do_lgb_request() as well.
> 
> I'm confused.  That code looks like end_request:
> 
> void end_request(struct request *req, int uptodate)
> {
>   if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
>   add_disk_randomness(req->rq_disk);
>   blkdev_dequeue_request(req);
>   end_that_request_last(req, uptodate);
>   }
> }

Note hard_cur_sectors vs hard_nr_sectors. The former refers to the first
segment sector count, the latter to the total sector count in the
request. Hence the difference!

-- 
Jens Axboe



[PATCH] ia64: Fix noncoherent DMA API so devres builds

2007-02-11 Thread Roland Dreier
On ia64, drivers/base/dma-mapping.c doesn't build because it calls
dma_alloc_noncoherent() and dma_free_noncoherent(), which appear to be
terminally broken; the calls end up generating errors like

drivers/base/dma-mapping.c: In function 'dmam_noncoherent_release':
drivers/base/dma-mapping.c:32: error: 'struct ia64_machine_vector' has no 
member named 'platform_dma_free_coherent'

because the multiple levels of macro expansion in <asm/machvec.h>
and <asm/dma-mapping.h> end up turning a call to dma_free_noncoherent()
into ia64_mv.platform_dma_free_coherent (instead of the intended
ia64_mv.dma_free_coherent).

This patch fixes this by converting dma_{alloc,free}_noncoherent()
into inline functions that call the corresponding coherent functions,
instead of trying to do this with macros.

Signed-off-by: Roland Dreier <[EMAIL PROTECTED]>
---
 include/asm-ia64/dma-mapping.h |   15 +--
 1 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/asm-ia64/dma-mapping.h b/include/asm-ia64/dma-mapping.h
index ebd5887..6299b51 100644
--- a/include/asm-ia64/dma-mapping.h
+++ b/include/asm-ia64/dma-mapping.h
@@ -8,9 +8,20 @@
 #include <asm/machvec.h>
 
 #define dma_alloc_coherent platform_dma_alloc_coherent
-#define dma_alloc_noncoherent  platform_dma_alloc_coherent /* coherent 
mem. is cheap */
+/* coherent mem. is cheap */
+static inline void *
+dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag)
+{
+   return dma_alloc_coherent(dev, size, dma_handle, flag);
+}
 #define dma_free_coherent  platform_dma_free_coherent
-#define dma_free_noncoherent   platform_dma_free_coherent
+static inline void
+dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
+dma_addr_t dma_handle)
+{
+   dma_free_coherent(dev, size, cpu_addr, dma_handle);
+}
 #define dma_map_single platform_dma_map_single
 #define dma_map_sg platform_dma_map_sg
 #define dma_unmap_single   platform_dma_unmap_single


Re: [PATCH 7/8] lguest: trivial guest block driver

2007-02-11 Thread Rusty Russell
On Mon, 2007-02-12 at 05:43 +0100, Jens Axboe wrote:
> On Mon, Feb 12 2007, Rusty Russell wrote:
> > +   end_request(bd->req, bd->lb_page->result == 1);
>
> You are using the old-style end request handling. So while I generally
> discourage use of end_request(), you seem to have a bigger problem here:

> > +   rq_for_each_bio(bio, req) {
> > +   struct bio_vec *bvec;
> > +   bio_for_each_segment(bvec, bio, idx) {
> > +   BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
> > +   BUG_ON(!bvec->bv_len);
> > +   dma->addr[i] = page_to_phys(bvec->bv_page)
> > +   + bvec->bv_offset;
> > +   dma->len[i] = bvec->bv_len;
> > +   len += bvec->bv_len;
> > +   i++;
> > +   }
> > +   }
> > +   if (i < LGUEST_MAX_DMA_SECTIONS)
> > +   dma->len[i] = 0;
> > +   return len;
> > +}
> 
> Here you map the entire request (let's call that segment A..Z), but
> end_request() only completes the first chunk of the request. So
> elv_next_request() will retrieve the same request again, and you'll then
> map B..Z and repeat that transfer. So unless I'm missing some other part
> here (just read it over quickly), you are re-doing large parts of a
> merged request several times.
> 
> So: don't use end_request(). Add some driver helper that does:
> 
> static void lgb_end_request(struct blockdev *bd)
> {
> int uptodate = bd->lb_page->result == 1;
> struct request *rq = bd->req;
> 
> end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
> add_disk_randomness(rq->rq_disk);
> blkdev_dequeue_request(rq);
> end_that_request_last(rq, uptodate);
> }
> 
> We could probably even make that a block layer helper, I'm sure others
> could be cleaned up with that as well. You want to use that helper in
> do_lgb_request() as well.

I'm confused.  That code looks like end_request:

void end_request(struct request *req, int uptodate)
{
if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
add_disk_randomness(req->rq_disk);
blkdev_dequeue_request(req);
end_that_request_last(req, uptodate);
}
}

Rusty.



Re: NAK new drivers without proper power management?

2007-02-11 Thread Willy Tarreau
On Mon, Feb 12, 2007 at 12:26:52AM +, Alan wrote:
> > Unless I'm mistaken, I have to type the passphrase twice then :
> >   - once at suspend
> >   - once at resume
> > 
> > which is once more per "boot" than what I'm doing on loop-aes.
> 
> You don't need to type in a key at suspend time if you don't want to.
> Think about gpg email - I can send you an encrypted email without typing
> any keys, you need the right key however to read it.

OK, so that means that it can generate a random secret which is encrypted
with your public key. That's very different from an encrypted FS, which has
to keep the same secret key over time for obvious reasons, but it is
smart to proceed this way.

One less "myth", as Nigel would call it ;-)

Thanks, Alan, for the clarifications.
Willy



Re: What are the real ioapic rte programming constraints?

2007-02-11 Thread Eric W. Biederman
Zwane Mwaikambo <[EMAIL PROTECTED]> writes:

> On Sun, 11 Feb 2007, Eric W. Biederman wrote:
>
>> > 2.15.2 PCI Express* Legacy INTx Support and Boot Interrupt
>> > http://download.intel.com/design/chipsets/datashts/30262802.pdf
>> 
>> Ouch.  And this kind of thing isn't exactly uncommon.
>> 
>> However if we have the irqs also disabled in the i8259 we should
>> be safe from actually receiving this interrupt (even if it generates
>> bus traffic), and when we enable the irq since it is level triggered  
>> we should still get an interrupt message.
>>
>> It isn't immediately obvious where the i8259 irq enable/disable
>> happens.  So I'm having trouble auditing that bit of code.
>>
>> Plus we can get very strange things like the irq number changing
>> and the sharing rules being different when going through the i8259.
>> So irqN may be irqM when going through the i8259.
>> 
>> As long as we aren't using anything on the i8259 including the timer
>> in ExtINT mode we can disable every interrupt pin and not worry about
>> interrupts from that source.
>
> We do the 8259 mask in setup_IO_APIC_irq. Does anyone have access to an 
> E7520/E7320 system for testing?

I think I do; I need to double check.

The thing is, this logic is different in that it uses INTx instead of pins,
but otherwise it is quite standard for chipsets and their IOAPICs.  I'm not
at all certain this behavior is what the original concern was about.
The description is different enough that you may have found a completely
different set of behaviors we have to worry about.

Since the legacy/non-legacy behavior is commonly invoked by the ioapic
mask bit, testing with just about any recent chipset should give a taste
of that.  But I will still try to dig up an E7520 and see what
happens.

Eric


[RFC PATCH] QRCU fastpath optimization

2007-02-11 Thread Paul E. McKenney
This patch optimizes the "quick" RCU update-side fastpath, so that in the
absence of readers, synchronize_qrcu() does four non-atomic comparisons
and three memory barriers, eliminating the need to acquire the global
lock in this case.  Lightly tested.  Algorithm has been validated for
the 3-reader-2-updater and 2-reader-3-updater cases -- 3-readers-3-updaters
case still to be done (I expect to get access to a large-memory machine
in the next few weeks -- need >>20GB).

Not for inclusion.  Patch is against Oleg's original patch, and likely
needs to be rediffed against Jens' patch stack.  I will do this rediffing
later; first I want an easy-to-test and easy-to-inspect version.
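
For reference, the read side that this fastpath reasons about looks roughly
like the following (paraphrased from Oleg's original QRCU patch, so treat it
as a sketch rather than the exact code).  The counter for the current index
starts at one and each reader adds one, while the inactive counter drains to
zero, so a sum of one across both counters means there are no readers:

int qrcu_read_lock(struct qrcu_struct *qp)
{
    for (;;) {
        int idx = qp->completed & 0x1;
        /* Only take a reference on the counter that is still "live". */
        if (likely(atomic_inc_not_zero(qp->ctr + idx)))
            return idx;
    }
}

void qrcu_read_unlock(struct qrcu_struct *qp, int idx)
{
    /* Last reader out wakes any waiting updater. */
    if (atomic_dec_and_test(qp->ctr + idx))
        wake_up(&qp->wq);
}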

Signed-off-by: Paul E. McKenney <[EMAIL PROTECTED]>
---

 srcu.c |   42 +-
 1 file changed, 37 insertions(+), 5 deletions(-)

diff -urpNa -X dontdiff linux-2.6.19-qrcu/kernel/srcu.c 
linux-2.6.19-qrcu-fp/kernel/srcu.c
--- linux-2.6.19-qrcu/kernel/srcu.c 2007-02-05 16:27:50.0 -0800
+++ linux-2.6.19-qrcu-fp/kernel/srcu.c  2007-02-11 18:10:35.0 -0800
@@ -287,23 +287,55 @@ void synchronize_qrcu(struct qrcu_struct
 {
int idx;
 
-   smp_mb();
+   smp_mb();  /* Force preceding change to happen before fastpath check. */
+
+   /*
+* Fastpath: If the two counters sum to "1" at a given point in
+* time, there are no readers.  However, it takes two separate
+* loads to sample both counters, which won't occur simultaneously.
+* So we might race with a counter switch, so that we might see
+* ctr[0]==0, then the counter might switch, then we might see
+* ctr[1]==1 (unbeknownst to us because there is a reader still
+* there).  So we do a read memory barrier and recheck.  If the
+* same race happens again, there must have been a second counter
+* switch.  This second counter switch could not have happened
+* until all preceding readers finished, so if the condition
+* is true both times, we may safely proceed.
+*
+* This relies critically on the atomic increment and atomic
+* decrement being seen as executing in order.
+*/
+
+   if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) {
+   smp_rmb();  /* Keep two checks independent. */
+   if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1)
+   goto out;
+   }
+
mutex_lock(&qp->mutex);
 
idx = qp->completed & 0x1;
if (atomic_read(qp->ctr + idx) == 1)
-   goto out;
+   goto out_unlock;
 
atomic_inc(qp->ctr + (idx ^ 0x1));
-   /* Reduce the likelihood that qrcu_read_lock() will loop */
+
+   /*
+* Prevent subsequent decrement from being seen before previous
+* increment -- such an inversion could cause the fastpath
+* above to falsely conclude that there were no readers.  Also,
+* reduce the likelihood that qrcu_read_lock() will loop.
+*/
+
smp_mb__after_atomic_inc();
qp->completed++;
 
atomic_dec(qp->ctr + idx);
__wait_event(qp->wq, !atomic_read(qp->ctr + idx));
-out:
+out_unlock:
mutex_unlock(&qp->mutex);
-   smp_mb();
+out:
+   smp_mb(); /* force subsequent free after qrcu_read_unlock(). */
 }
 
 EXPORT_SYMBOL_GPL(init_qrcu_struct);


Re: [PATCH 7/8] lguest: trivial guest block driver

2007-02-11 Thread Jens Axboe
On Mon, Feb 12 2007, Rusty Russell wrote:
> +static irqreturn_t lgb_irq(int irq, void *_bd)
> +{
> + struct blockdev *bd = _bd;
> + unsigned long flags;
> +
> + if (!bd->req) {
> + pr_debug("No work!\n");
> + return IRQ_NONE;
> + }
> +
> + if (!bd->lb_page->result) {
> + pr_debug("No result!\n");
> + return IRQ_NONE;
> + }
> +
> + spin_lock_irqsave(&bd->lock, flags);
> + end_request(bd->req, bd->lb_page->result == 1);
> + bd->req = NULL;
> + bd->dma.used_len = 0;
> + blk_start_queue(bd->disk->queue);
> + spin_unlock_irqrestore(&bd->lock, flags);
> + return IRQ_HANDLED;
> +}

You are using the old-style end request handling. So while I generally
discourage use of end_request(), you seem to have a bigger problem here:

> +static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
> +{
> + unsigned int i = 0, idx, len = 0;
> + struct bio *bio;
> +
> + rq_for_each_bio(bio, req) {
> + struct bio_vec *bvec;
> + bio_for_each_segment(bvec, bio, idx) {
> + BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
> + BUG_ON(!bvec->bv_len);
> + dma->addr[i] = page_to_phys(bvec->bv_page)
> + + bvec->bv_offset;
> + dma->len[i] = bvec->bv_len;
> + len += bvec->bv_len;
> + i++;
> + }
> + }
> + if (i < LGUEST_MAX_DMA_SECTIONS)
> + dma->len[i] = 0;
> + return len;
> +}

Here you map the entire request (let's call that segment A..Z), but
end_request() only completes the first chunk of the request. So
elv_next_request() will retrieve the same request again, and you'll then
map B..Z and repeat that transfer. So unless I'm missing some other part
here (just read it over quickly), you are re-doing large parts of a
merged request several times.

So: don't use end_request(). Add some driver helper that does:

static void lgb_end_request(struct blockdev *bd)
{
int uptodate = bd->lb_page->result == 1;
struct request *rq = bd->req;

end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
add_disk_randomness(rq->rq_disk);
blkdev_dequeue_request(rq);
end_that_request_last(rq, uptodate);
}

We could probably even make that a block layer helper, I'm sure others
could be cleaned up with that as well. You want to use that helper in
do_lgb_request() as well.

-- 
Jens Axboe



Re: NAK new drivers without proper power management?

2007-02-11 Thread Nigel Cunningham
Howdy!

On Mon, 2007-02-12 at 01:10 +0100, Tilman Schmidt wrote:
> Hi,
> 
> Am 11.02.2007 23:37 schrieb Nigel Cunningham:
> > On Sun, 2007-02-11 at 00:45 +0100, Tilman Schmidt wrote:
> >> Am 10.02.2007 23:37 schrieb Nigel Cunningham:
> >>> If your device requires power management, and you know it requires power
> >>> management, why not just implement power management? [...]
> >> Like it or not, power management is far from trivial, and people
> >> writing device drivers have limited resources. [...]
> > It's not that complex. All we're really talking about is a bit of extra
> > code to clean up and configure hardware state; things that the driver
> > author already knows how to do. S3 might require a bit more
> > initialisation if firmware needs to be reloaded or more extensive
> > configuration needs to be done, but if there's firmware to be loaded,
> > there is a reasonably good probability that we loaded it from Linux to
> > start with anyway.
> 
> You are assuming a perfect world where driver authors have complete
> knowledge of their devices. In reality, many drivers (including
> those I have the mixed pleasure of maintaining) are based at least
> in part on reverse engineering, and managing power states may well
> fall into the domain of things not yet sufficiently reverse
> engineered.

Nope. I'm assuming that the driver author knows what needs to be done to
get the driver out of whatever state the BIOS puts it in to start with,
and into an operational state, and that they therefore also know what
needs to be done to take it out of the operational state again. I'm
admitting that there's also another state - the post suspend-to-ram
driver state - that they may not know how to deal with. But for
suspend-to-disk, if you know how to get the driver to work in the first
place, you know enough to stop it working (.suspend) and start it up
again (.resume) for the hibernate case at least.

I'm not assuming that you know enough to be able to put the driver into
a low-power state and get it out again. This is definitely preferable, and at
least possibly essential for suspend to ram, but for some unknown reason
I'm quite hibernation focused, and for that, just the above is
sufficient.

> >> Also, in your argument you neglected a few cases:
> >> - What if my device does not require power management?
> > 
> > Then you use a generic routine that does nothing but return success
> > (potentially shared with other drivers that are in the same situation).
> 
> But if I just write an empty routine like that I open myself up to
> criticism along the lines of "writing dummy routines just in order
> to shut up kernel warnings". BTDT.

Well, it might not be completely empty. I think someone already pointed
out that there's a minimal set of PCI bus calls that PCI drivers
would want to invoke. But we wouldn't (rightly) accuse you of such
things if we decided that the policy was "Every driver ought to have a
resume routine, even if it's just a minimal I-just-work routine".
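
Something like this would do for the PCI case, if the device really has
nothing of its own to save (an illustrative sketch only; "foo" is a
placeholder driver name and the calls are just the generic PCI helpers):

static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
{
    /* No device-specific state to save; just the generic PCI bookkeeping. */
    pci_save_state(pdev);
    pci_disable_device(pdev);
    pci_set_power_state(pdev, pci_choose_state(pdev, state));
    return 0;
}

static int foo_resume(struct pci_dev *pdev)
{
    pci_set_power_state(pdev, PCI_D0);
    pci_restore_state(pdev);
    return pci_enable_device(pdev);
}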

> >> - What if I don't know whether my device requires power management?
> > 
> > The questions are straight forward: Is there hardware state that needs
> > to be configured if you've just booted the computer and nothing else has
> > touched it? If so, that needs to be done in a resume method. Do you need
> > to clean up state prior to doing the things in the resume method, or
> > otherwise do things to quiesce the driver? If so, they will need to be
> > done in the suspend method. The result will be roughly similar to what
> > you do for module load/unload, except maybe less complete in some cases.
> 
> I don't doubt your basic assessment. However it doesn't translate that
> easily into a real implementation. In my case, I maintain a USB driver,
> so I have to deal with USB specifics of suspend/resume which happen not
> to be that well documented. My driver provides an isdn4linux device but
> isdn4linux knows nothing about suspend/resume so I am on my own on how
> to reconcile the two. The device itself, though in turn far from trivial,
> is actually the least of my worries.

Mmm, so that's a case where we need to prod those who write
documentation and bus support first. You're probably closer! :)

Regards,

Nigel



[PATCH 7/8] lguest: trivial guest block driver

2007-02-11 Thread Rusty Russell
A simple block driver for lguest (/dev/lgbX).  Only does one request
at once.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -28,4 +28,5 @@ obj-$(CONFIG_VIODASD) += viodasd.o
 obj-$(CONFIG_VIODASD)  += viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)  += sx8.o
 obj-$(CONFIG_BLK_DEV_UB)   += ub.o
+obj-$(CONFIG_LGUEST_GUEST) += lguest_blk.o
 
===
--- /dev/null
+++ b/drivers/block/lguest_blk.c
@@ -0,0 +1,260 @@
+/* A simple block driver for lguest.
+ *
+ * Copyright 2006 Rusty Russell <[EMAIL PROTECTED]> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+//#define DEBUG
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static char next_block_index = 'a';
+
+struct blockdev
+{
+   spinlock_t lock;
+
+   /* The disk structure for the kernel. */
+   struct gendisk *disk;
+
+   /* The major number for this disk. */
+   int major;
+   int irq;
+
+   unsigned long phys_addr;
+   /* The ioremap'ed block page. */
+   struct lguest_block_page *lb_page;
+
+   /* We only have a single request outstanding at a time. */
+   struct lguest_dma dma;
+   struct request *req;
+};
+
+static irqreturn_t lgb_irq(int irq, void *_bd)
+{
+   struct blockdev *bd = _bd;
+   unsigned long flags;
+
+   if (!bd->req) {
+   pr_debug("No work!\n");
+   return IRQ_NONE;
+   }
+
+   if (!bd->lb_page->result) {
+   pr_debug("No result!\n");
+   return IRQ_NONE;
+   }
+
+   spin_lock_irqsave(&bd->lock, flags);
+   end_request(bd->req, bd->lb_page->result == 1);
+   bd->req = NULL;
+   bd->dma.used_len = 0;
+   blk_start_queue(bd->disk->queue);
+   spin_unlock_irqrestore(&bd->lock, flags);
+   return IRQ_HANDLED;
+}
+
+static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
+{
+   unsigned int i = 0, idx, len = 0;
+   struct bio *bio;
+
+   rq_for_each_bio(bio, req) {
+   struct bio_vec *bvec;
+   bio_for_each_segment(bvec, bio, idx) {
+   BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
+   BUG_ON(!bvec->bv_len);
+   dma->addr[i] = page_to_phys(bvec->bv_page)
+   + bvec->bv_offset;
+   dma->len[i] = bvec->bv_len;
+   len += bvec->bv_len;
+   i++;
+   }
+   }
+   if (i < LGUEST_MAX_DMA_SECTIONS)
+   dma->len[i] = 0;
+   return len;
+}
+
+static void empty_dma(struct lguest_dma *dma)
+{
+   dma->len[0] = 0;
+}
+
+static void setup_req(struct blockdev *bd,
+ int type, struct request *req, struct lguest_dma *dma)
+{
+   bd->lb_page->type = type;
+   bd->lb_page->sector = req->sector;
+   bd->lb_page->result = 0;
+   bd->req = req;
+   bd->lb_page->bytes = req_to_dma(req, dma);
+}
+
+static int do_write(struct blockdev *bd, struct request *req)
+{
+   struct lguest_dma send;
+
+   pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
+   setup_req(bd, 1, req, &send);
+
+   hcall(LHCALL_SEND_DMA, bd->phys_addr, __pa(&send), 0);
+   return 1;
+}
+
+static int do_read(struct blockdev *bd, struct request *req)
+{
+   struct lguest_dma ping;
+
+   pr_debug("lgb: READ sector %li\n", (long)req->sector);
+   setup_req(bd, 0, req, &bd->dma);
+
+   empty_dma(&ping);
+   hcall(LHCALL_SEND_DMA, bd->phys_addr, __pa(&ping), 0);
+   return 1;
+}
+
+static void do_lgb_request(request_queue_t *q)
+{
+   struct blockdev *bd;
+   struct request *req;
+   int ok;
+
+again:
+   req = elv_next_request(q);
+   if (!req)
+   return;
+
+   bd = req->rq_disk->private_data;
+   /* Sometimes we get repeated requests after blk_stop_queue. */
+   if (bd->req)
+   return;
+
+   if (!blk_fs_request(req)) {
+   pr_debug("Got non-command 0x%08x\n", req->cmd_type);
+   error:
+   req->errors++;
+   end_request(req, 0);
+   goto again;
+   } 

[PATCH 8/8] lguest: documentatation and example launcher

2007-02-11 Thread Rusty Russell
Fairly complete documentation for lguest.  I actually want to get rid
of the "coding" part of lguest.txt and roll it into the code itself,
literary-programming-style.

The launcher utility is also here: I don't have delusions of interface
stability, so it makes sense to have it here as an example, and it's
only 1000 lines.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

diff -r 8806a441a0b1 Documentation/dontdiff
--- a/Documentation/dontdiff    Mon Feb 12 13:02:02 2007 +1100
+++ b/Documentation/dontdiff    Mon Feb 12 13:47:43 2007 +1100
@@ -144,3 +144,6 @@ wanxlfw.inc
 wanxlfw.inc
 uImage
 zImage
+hypervisor-blob.c
+lguest.lds
+hypervisor-raw
diff -r 8806a441a0b1 Documentation/lguest/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/Documentation/lguest/Makefile Mon Feb 12 13:48:13 2007 +1100
@@ -0,0 +1,21 @@
+# This creates the demonstration utility "lguest" which runs a Linux guest.
+
+# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
# Some shells (dash - ubuntu) can't handle numbers that big so we cheat.
+include ../../.config
LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
+   -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+LDLIBS:=-lz
+
+all: lguest.lds lguest
+
+# The linker script on x86 is so complex the only way of creating one
+# which will link our binary in the right place is to mangle the
+# default one.
+lguest.lds:
+   $(LD) --verbose | awk '/^==/ { PRINT=1; next; } 
/SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) 
print $$0; }' > $@
+
+clean:
+   rm -f lguest.lds lguest
diff -r 8806a441a0b1 Documentation/lguest/lguest.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/Documentation/lguest/lguest.c Mon Feb 12 13:47:43 2007 +1100
@@ -0,0 +1,989 @@
+/* Simple program to layout "physical" memory for new lguest guest.
+ * Linked high to avoid likely physical memory.  */
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+#include "../../include/asm/lguest_user.h"
+
+#define PAGE_PRESENT 0x7   /* Present, RW, Execute */
+#define NET_PEERNUM 1
+
+static bool verbose;
+#define verbose(args...) \
+   do { if (verbose) printf(args); fflush(stdout); } while(0)
+
+struct devices
+{
+   fd_set infds;
+   int max_infd;
+
+   struct device *dev;
+};
+
+struct device
+{
+   struct device *next;
+   struct lguest_device_desc *desc;
+   void *mem;
+
+   /* Watch this fd if handle_input non-NULL. */
+   int fd;
+   int (*handle_input)(int fd, struct device *me);
+
+   /* Watch DMA to this address if handle_input non-NULL. */
+   unsigned long watch_address;
+   u32 (*handle_output)(int fd, const struct iovec *iov,
+unsigned int num, struct device *me);
+
+   /* Device-specific data. */
+   void *priv;
+};
+
+static char buf[1024];
+static struct iovec discard_iov = { .iov_base=buf, .iov_len=sizeof(buf) };
+static int zero_fd;
+
+static u32 memparse(const char *ptr)
+{
+   char *end;
+   unsigned long ret = strtoul(ptr, &end, 0);
+
+   switch (*end) {
+   case 'G':
+   case 'g':
+   ret <<= 10;
+   case 'M':
+   case 'm':
+   ret <<= 10;
+   case 'K':
+   case 'k':
+   ret <<= 10;
+   end++;
+   default:
+   break;
+   }
+   return ret;
+}
+
+static inline unsigned long page_align(unsigned long addr)
+{
+   return ((addr + getpagesize()-1) & ~(getpagesize()-1));
+}
+
+/* initrd gets loaded at top of memory: return length. */
+static unsigned long load_initrd(const char *name, unsigned long end)
+{
+   int ifd;
+   struct stat st;
+   void *iaddr;
+
+   if (!name)
+   return 0;
+
+   ifd = open(name, O_RDONLY, 0);
+   if (ifd < 0)
+   err(1, "Opening initrd '%s'", name);
+   
+   if (fstat(ifd, &st) < 0)
+   err(1, "fstat() on initrd '%s'", name);
+
+   iaddr = mmap((void *)end - st.st_size, st.st_size,
+PROT_READ|PROT_EXEC|PROT_WRITE,
+MAP_FIXED|MAP_PRIVATE, ifd, 0);
+   if (iaddr != (void *)end - st.st_size)
+   err(1, "Mmaping initrd '%s' returned %p not %p",
+   name, iaddr, (void *)end - st.st_size);
+   close(ifd);
+   verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+   return st.st_size;
+}
+
+/* First map /dev/zero over entire memory, then insert kernel. */
+static void map_memory(unsigned 

[PATCH 6/8] lguest: trivial guest console driver

2007-02-11 Thread Rusty Russell
A trivial driver to have a basic lguest console, using the hvc_console
infrastructure.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

diff -r aaa62bd9788a drivers/char/Makefile
--- a/drivers/char/Makefile Mon Feb 12 13:01:16 2007 +1100
+++ b/drivers/char/Makefile Mon Feb 12 13:01:45 2007 +1100
@@ -44,6 +44,7 @@ obj-$(CONFIG_SX)  += sx.o generic_serial
 obj-$(CONFIG_SX)   += sx.o generic_serial.o
 obj-$(CONFIG_RIO)  += rio/ generic_serial.o
 obj-$(CONFIG_HVC_CONSOLE)  += hvc_vio.o hvsi.o
+obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o
 obj-$(CONFIG_HVC_ISERIES)  += hvc_iseries.o
 obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
 obj-$(CONFIG_HVC_DRIVER)   += hvc_console.o
diff -r aaa62bd9788a drivers/char/hvc_lguest.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/char/hvc_lguest.c Mon Feb 12 13:01:19 2007 +1100
@@ -0,0 +1,99 @@
+/* Simple console for lguest.
+ *
+ * Copyright (C) 2006 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include 
+#include 
+#include 
+#include "hvc_console.h"
+
+static int cons_irq;
+static int cons_offset;
+static char inbuf[256];
+static struct lguest_dma cons_input = { .used_len = 0,
+   .addr[0] = __pa(inbuf),
+   .len[0] = sizeof(inbuf),
+   .len[1] = 0 };
+
+static int get_chars(u32 vtermno, char *buf, int count)
+{
+   if (!cons_input.used_len)
+   return 0;
+
+   if (cons_input.used_len - cons_offset < count)
+   count = cons_input.used_len - cons_offset;
+
+   memcpy(buf, inbuf + cons_offset, count);
+   cons_offset += count;
+   if (cons_offset == cons_input.used_len) {
+   cons_offset = 0;
+   cons_input.used_len = 0;
+   }
+   return count;
+}
+
+static int put_chars(u32 vtermno, const char *buf, int count)
+{
+   struct lguest_dma dma;
+
+   /* FIXME: what if it's over a page boundary? */
+   dma.len[0] = count;
+   dma.len[1] = 0;
+   dma.addr[0] = __pa(buf);
+
+   hcall(LHCALL_SEND_DMA, 4, __pa(&dma), 0);
+   return count;
+}
+
+struct hv_ops lguest_cons = {
+   .get_chars = get_chars,
+   .put_chars = put_chars,
+};
+
+static int __init cons_init(void)
+{
+   if (strcmp(paravirt_ops.name, "lguest") != 0)
+   return 0;
+
+   return hvc_instantiate(0, 0, &lguest_cons);
+}
+console_initcall(cons_init);
+
+static int lguestcons_probe(struct lguest_device *lhdev)
+{
+   cons_irq = lhdev->index+1;
+   lhdev->private = hvc_alloc(0, cons_irq, &lguest_cons, 256);
+   if (IS_ERR(lhdev->private))
+   return PTR_ERR(lhdev->private);
+
+   if (!hcall(LHCALL_BIND_DMA, 0, __pa(&cons_input), (1<<8)+cons_irq))
+   printk("lguest console: failed to bind buffer.\n");
+   return 0;
+}
+
+static struct lguest_driver lguestcons_drv = {
+   .name = "lguestcons",
+   .owner = THIS_MODULE,
+   .device_type = LGUEST_DEVICE_T_CONSOLE,
+   .probe = lguestcons_probe,
+};
+
+static int __init hvc_lguest_init(void)
+{
+   return register_lguest_driver(&lguestcons_drv);
+}
+module_init(hvc_lguest_init);




[PATCH 5/8] lguest: trivial guest network driver

2007-02-11 Thread Rusty Russell
This network driver operates both to the host process, and to other
guests.  It's pretty trivial.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -217,3 +217,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
 obj-$(CONFIG_FS_ENET) += fs_enet/
 
 obj-$(CONFIG_NETXEN_NIC) += netxen/
+obj-$(CONFIG_LGUEST_GUEST) += lguest_net.o
===
--- /dev/null
+++ b/drivers/net/lguest_net.c
@@ -0,0 +1,400 @@
+/* A simple network driver for lguest.
+ *
+ * Copyright 2006 Rusty Russell <[EMAIL PROTECTED]> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+//#define DEBUG
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SHARED_SIZE    PAGE_SIZE
+#define DATA_SIZE  1500
+#define MAX_LANS   4
+#define NUM_SKBS   8
+/* We overload multicast bit to show promiscuous mode. */
+#define PROMISC_BIT0x80
+
+struct lguestnet_info
+{
+   /* The shared page. */
+   struct lguest_net *peer;
+   unsigned long peer_phys;
+
+   /* My peerid. */
+   unsigned int me;
+
+   struct net_device_stats stats;
+
+   /* Receive queue. */
+   struct sk_buff *skb[NUM_SKBS];
+   struct lguest_dma dma[NUM_SKBS];
+};
+
+/* How many bytes left in this page. */
+static unsigned int rest_of_page(void *data)
+{
+   return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
+}
+
+/* Simple convention: offset 4 * peernum. */
+static unsigned long peer_addr(struct lguestnet_info *info, unsigned peernum)
+{
+   return info->peer_phys + 4 * peernum;
+}
+
+static void skb_to_dma(const struct sk_buff *skb, unsigned int len,
+  struct lguest_dma *dma)
+{
+   unsigned int i, seg;
+
+   for (i = seg = 0; i < len; seg++, i += rest_of_page(skb->data + i)) {
+   dma->addr[seg] = virt_to_phys(skb->data + i);
+   dma->len[seg] = min((unsigned)(len - i),
+   rest_of_page(skb->data + i));
+   }
+   for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
+   const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+   /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
+   if (seg == LGUEST_MAX_DMA_SECTIONS) {
+   printk("Woah dude!  Megapacket!\n");
+   break;
+   }
+   dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
+   dma->len[seg] = f->size;
+   }
+   if (seg < LGUEST_MAX_DMA_SECTIONS)
+   dma->len[seg] = 0;
+}
+
+static void transfer_packet(struct net_device *dev,
+   struct sk_buff *skb,
+   unsigned int peernum)
+{
+   struct lguestnet_info *info = dev->priv;
+   struct lguest_dma dma;
+
+   skb_to_dma(skb, skb->len, &dma);
+   pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
+
+   hcall(LHCALL_SEND_DMA, peer_addr(info, peernum), __pa(&dma), 0);
+   if (dma.used_len != skb->len) {
+   info->stats.tx_carrier_errors++;
+   pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
+peernum, dma.used_len, skb->len,
+(void *)dma.addr[0], dma.len[0]);
+   } else {
+   pr_debug("lguestnet: sent %u bytes\n", skb->len);
+   info->stats.tx_bytes += skb->len;
+   info->stats.tx_packets++;
+   }
+}
+
+static int mac_eq(const unsigned char mac[ETH_ALEN],
+ struct lguestnet_info *info, unsigned int peer)
+{
+   /* Ignore multicast bit, which peer turns on to mean promisc. */
+   if ((info->peer[peer].promisc & (~PROMISC_BIT)) != mac[0])
+   return 0;
+   return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
+}
+
+static int unused_peer(const struct lguest_net peer[], unsigned int num)
+{
+   return peer[num].guestid == 0x;
+}
+
+static int is_broadcast(const unsigned char dest[ETH_ALEN])
+{
+   return dest[0] == 0xFF && dest[1] == 0xFF && dest[2] == 0xFF
+   && dest[3] == 0xFF && dest[4] == 0xFF && dest[5] == 

[PATCH 4/8] lguest: Makefile

2007-02-11 Thread Rusty Russell
Finally, we put in the Makefile, so it will build.

Linking the switcher code (hypervisor.S) ready to be copied
into the top of memory is the only non-trivial thing here.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -108,6 +108,7 @@ drivers-$(CONFIG_PCI)   += arch/i386/pci
 # must be linked after kernel/
 drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/
 drivers-$(CONFIG_PM)   += arch/i386/power/
+drivers-$(CONFIG_LGUEST_GUEST) += arch/i386/lguest/
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
===
--- /dev/null
+++ b/arch/i386/lguest/Makefile
@@ -0,0 +1,22 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST)   += lg.o
+lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+   segments.o io.o lguest_user.o
+
+# We use top 4MB for guest traps page, then hypervisor. */
HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+$(obj)/core.o: $(obj)/hypervisor-blob.c
+# This links the hypervisor in the right place and turns it into a C array.
+$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+   @$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf 
%#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+   @od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
+clean-files := hypervisor-blob.c hypervisor-raw




[PATCH 3/8] lguest: Guest code

2007-02-11 Thread Rusty Russell
This is the guest code which replaces the parts of paravirt_ops with
hypercalls.  It's fairly trivial.  This patch also includes trivial
bus driver for lguest devices, and an extern declarations for boot_pda
(previously frobbed only from head.S).

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

diff -r 3c4f57d11d07 arch/i386/lguest/lguest.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/i386/lguest/lguest.c Mon Feb 12 14:21:57 2007 +1100
@@ -0,0 +1,562 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell <[EMAIL PROTECTED]> IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+void async_hcall(unsigned long call,
+unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+   /* Note: This code assumes we're uniprocessor. */
+   static unsigned int next_call;
+   unsigned long flags;
+
+   local_irq_save(flags);
+   if (lguest_data.hcall_status[next_call] != 0xFF) {
+   /* Table full, so do normal hcall which will flush table. */
+   hcall(call, arg1, arg2, arg3);
+   } else {
+   lguest_data.hcalls[next_call].eax = call;
+   lguest_data.hcalls[next_call].edx = arg1;
+   lguest_data.hcalls[next_call].ebx = arg2;
+   lguest_data.hcalls[next_call].ecx = arg3;
+   wmb();
+   lguest_data.hcall_status[next_call] = 0;
+   if (++next_call == LHCALL_RING_SIZE)
+   next_call = 0;
+   }
+   local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE  /* Not in 2.6.20. */
+static int lazy_mode;
+static void fastcall lguest_lazy_mode(int mode)
+{
+   lazy_mode = mode;
+   if (mode == PARAVIRT_LAZY_NONE)
+   hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+  unsigned long arg1,
+  unsigned long arg2,
+  unsigned long arg3)
+{
+   if (lazy_mode == PARAVIRT_LAZY_NONE)
+   hcall(call, arg1, arg2, arg3);
+   else
+   async_hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+static unsigned long fastcall save_fl(void)
+{
+   return lguest_data.irq_enabled;
+}
+
+static void fastcall restore_fl(unsigned long flags)
+{
+   /* FIXME: Check if interrupt pending... */
+   lguest_data.irq_enabled = flags;
+}
+
+static void fastcall irq_disable(void)
+{
+   lguest_data.irq_enabled = 0;
+}
+
+static void fastcall irq_enable(void)
+{
+   /* Linux i386 code expects bit 9 set. */
+   /* FIXME: Check if interrupt pending... */
+   lguest_data.irq_enabled = 512;
+}
+
+static void fastcall lguest_load_gdt(const struct Xgt_desc_struct *desc)
+{
+   BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
+   hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+}
+
+static void fastcall lguest_load_idt(const struct Xgt_desc_struct *desc)
+{
+   unsigned int i;
+   struct desc_struct *idt = (void *)desc->address;
+
+   for (i = 0; i < (desc->size+1)/8; i++)
+   hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+   hcall(LHCALL_CRASH, __pa(p), 0, 0);
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+   .notifier_call = lguest_panic
+};
+
+static char *lguest_memory_setup(void)
+{
+   /* We do this here because lockcheck barfs if before start_kernel */
+   atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+   e820.nr_map = 0;
+   add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+   return "LGUEST";
+}
+
+static fastcall void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+   int is_feature = (*eax 

[PATCH 1/8] lguest: Kconfig and headers

2007-02-11 Thread Rusty Russell
Unfortunately, we don't have the build infrastructure for "private"
asm-offsets.h files, so there's a not-so-neat include in
arch/i386/kernel/asm-offsets.c.

The four headers are:
asm/lguest.h:
Things the guest needs to know (hypercall numbers, etc).
asm/lguest_device.h:
Things lguest devices need to know (lguest bus registration)
asm/lguest_user.h:
Things that the lguest userspace utility needs (/dev/lguest
and some devices)
arch/i386/lguest/lg.h:
Internal header for the lg module (which consists of 8 files).

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

diff -r c2ef4d061458 arch/i386/Kconfig
--- a/arch/i386/Kconfig Mon Feb 12 12:56:58 2007 +1100
+++ b/arch/i386/Kconfig Mon Feb 12 12:57:00 2007 +1100
@@ -253,6 +253,27 @@ config ES7000_CLUSTERED_APIC
depends on SMP && X86_ES7000 && MPENTIUMIII
 
 source "arch/i386/Kconfig.cpu"
+
+config LGUEST
+   tristate "Linux hypervisor example code"
+   depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
+   select LGUEST_GUEST
+   select HVC_DRIVER
+   ---help---
+ This is a very simple module which allows you to run
+ multiple instances of the same Linux kernel, using the
+ "lguest" command found in the Documentation/lguest directory.
+ Note that "lguest" is pronounced to rhyme with "fell quest",
+ not "rustyvisor".  See Documentation/lguest/lguest.txt.
+
+ If unsure, say N.  If curious, say M.  If masochistic, say Y.
+
+config LGUEST_GUEST
+   bool
+   help
+ The guest needs code built-in, even if the host has lguest
+ support as a module.  The drivers are tiny, so we build them
+ in too.
 
 config HPET_TIMER
bool "HPET Timer Support"
diff -r c2ef4d061458 arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c    Mon Feb 12 12:56:58 2007 +1100
+++ b/arch/i386/kernel/asm-offsets.c    Mon Feb 12 12:57:00 2007 +1100
@@ -16,6 +16,10 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_LGUEST_GUEST
+#include 
+#include "../lguest/lg.h"
+#endif
 
 #define DEFINE(sym, val) \
 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -111,4 +115,19 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_LGUEST_GUEST
+   BLANK();
+   OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+   OFFSET(LGUEST_STATE_host_stackptr, lguest_state, host.stackptr);
+   OFFSET(LGUEST_STATE_host_pgdir, lguest_state, host.pgdir);
+   OFFSET(LGUEST_STATE_host_gdt, lguest_state, host.gdt);
+   OFFSET(LGUEST_STATE_host_idt, lguest_state, host.idt);
+   OFFSET(LGUEST_STATE_regs, lguest_state, regs);
+   OFFSET(LGUEST_STATE_gdt, lguest_state, gdt);
+   OFFSET(LGUEST_STATE_idt, lguest_state, idt);
+   OFFSET(LGUEST_STATE_gdt_table, lguest_state, gdt_table);
+   OFFSET(LGUEST_STATE_trapnum, lguest_state, regs.trapnum);
+   OFFSET(LGUEST_STATE_errcode, lguest_state, regs.errcode);
+#endif
 }
diff -r c2ef4d061458 arch/i386/lguest/lg.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/i386/lguest/lg.h Mon Feb 12 12:59:06 2007 +1100
@@ -0,0 +1,253 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include 
+/* 64k ought to be enough for anybody! */
+#define HYPERVISOR_MAP_ORDER 16
+#define HYPERVISOR_PAGES ((1 << HYPERVISOR_MAP_ORDER)/PAGE_SIZE)
+
+#define GDT_ENTRY_LGUEST_CS10
+#define GDT_ENTRY_LGUEST_DS11
+#define LGUEST_CS  (GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS  (GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "irq_vectors.h"
+
+#define GUEST_DPL 1
+
+struct lguest_regs
+{
+   /* Manually saved part. */
+   u32 cr3;
+   u32 ebx, ecx, edx;
+   u32 esi, edi, ebp;
+   u32 gs;
+   u32 eax;
+   u32 fs, ds, es;
+   u32 trapnum, errcode;
+   /* Trap pushed part */
+   u32 eip;
+   u32 cs;
+   u32 eflags;
+   u32 esp;
+   u32 ss;
+};
+
+__exit void free_pagetables(void);
+__init int init_pagetables(struct page *hype_pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+/* Simplified version of IDT. */
+struct host_trap
+{
+   unsigned long addr;
+   int disable_interrupts;
+};
+
+struct lguest_dma_info
+{
+   struct list_head list;
+   union futex_key key;
+   unsigned long dmas;
+   u16 next_dma;
+   u16 num_dmas;
+   u16 guestid;
+   u8 interrupt;   /* 0 when not registered */
+};
+
+struct pgdir
+{
+   u32 cr3;
+   u32 *pgdir;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+   struct lguest_state *state;
+   struct lguest_data 

[PATCH 2/2] lguest preparation: expose futex infrastructure: get_futex_key, get_key_refs and drop_key_refs

2007-02-11 Thread Rusty Russell
lguest uses the convenient futex infrastructure for inter-domain I/O,
so expose get_futex_key, get_key_refs (renamed get_futex_key_refs) and
drop_key_refs (renamed drop_futex_key_refs).  This also means we need to
expose the union that these use.

No code changes.
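
For illustration, the sort of use lguest makes of these looks roughly like
the following (a sketch only, not taken from the lguest patches; the helper
name and error handling are invented):

static int pin_futex_key(u32 __user *uaddr, union futex_key *key)
{
    int err;

    /* get_futex_key() must be called under mmap_sem. */
    down_read(&current->mm->mmap_sem);
    err = get_futex_key(uaddr, key);
    if (!err)
        get_futex_key_refs(key);
    up_read(&current->mm->mmap_sem);

    /* The caller does drop_futex_key_refs(key) when it unbinds. */
    return err;
}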

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -100,6 +100,35 @@ extern int
 extern int
 handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
 
+/*
+ * Futexes are matched on equal values of this key.
+ * The key type depends on whether it's a shared or private mapping.
+ * Don't rearrange members without looking at hash_futex().
+ *
+ * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
+ * We set bit 0 to indicate if it's an inode-based key.
+ */
+union futex_key {
+   struct {
+   unsigned long pgoff;
+   struct inode *inode;
+   int offset;
+   } shared;
+   struct {
+   unsigned long address;
+   struct mm_struct *mm;
+   int offset;
+   } private;
+   struct {
+   unsigned long word;
+   void *ptr;
+   int offset;
+   } both;
+};
+int get_futex_key(u32 __user *uaddr, union futex_key *key);
+void get_futex_key_refs(union futex_key *key);
+void drop_futex_key_refs(union futex_key *key);
+
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
 extern void exit_pi_state_list(struct task_struct *curr);
===
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,37 +48,12 @@
 #include 
 #include 
 #include 
+#include <linux/module.h>
 #include 
 
 #include "rtmutex_common.h"
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
-
-/*
- * Futexes are matched on equal values of this key.
- * The key type depends on whether it's a shared or private mapping.
- * Don't rearrange members without looking at hash_futex().
- *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
-union futex_key {
-   struct {
-   unsigned long pgoff;
-   struct inode *inode;
-   int offset;
-   } shared;
-   struct {
-   unsigned long address;
-   struct mm_struct *mm;
-   int offset;
-   } private;
-   struct {
-   unsigned long word;
-   void *ptr;
-   int offset;
-   } both;
-};
 
 /*
  * Priority Inheritance state:
@@ -175,7 +150,7 @@ static inline int match_futex(union fute
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, union futex_key *key)
 {
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uad
}
return err;
 }
+EXPORT_SYMBOL_GPL(get_futex_key);
 
 /*
  * Take a reference to the resource addressed by a key.
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uad
  * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
  * function, if it is called at all.  mmap_sem keeps key->shared.inode valid.
  */
-static inline void get_key_refs(union futex_key *key)
+inline void get_futex_key_refs(union futex_key *key)
 {
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -263,12 +239,13 @@ static inline void get_key_refs(union fu
atomic_inc(&key->private.mm->mm_count);
}
 }
+EXPORT_SYMBOL_GPL(get_futex_key_refs);
 
 /*
  * Drop a reference to the resource addressed by a key.
  * The hash bucket spinlock must not be held.
  */
-static void drop_key_refs(union futex_key *key)
+void drop_futex_key_refs(union futex_key *key)
 {
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_ke
mmdrop(key->private.mm);
}
 }
+EXPORT_SYMBOL_GPL(drop_futex_key_refs);
 
 static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
@@ -871,7 +849,7 @@ static int futex_requeue(u32 __user *uad
this->lock_ptr = &hb2->lock;
}
this->key = key2;
-   get_key_refs(&key2);
+   get_futex_key_refs(&key2);
drop_count++;
 
if (ret - nr_wake >= nr_requeue)
@@ -884,9 +862,9 @@ out_unlock:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
 
-   /* drop_key_refs() must be called outside the spinlocks. */
+   /* drop_futex_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
-   

[PATCH 1/2] lguest preparation: EXPORT_SYMBOL_GPL 5 functions

2007-02-11 Thread Rusty Russell
lguest does some fairly low-level things to support a host, which
normal modules don't need:

math_state_restore:
When the guest triggers a Device Not Available fault, we need
to be able to restore the FPU

__put_task_struct:
We need to hold a reference to another task for inter-guest
I/O, and put_task_struct() is an inline function which calls
__put_task_struct.

access_process_vm:
We need to access another task for inter-guest I/O.

map_vm_area & __get_vm_area:
We need to map the switcher shim (ie. monitor) at 0xFFC01000.
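
(For the last two, the usage is roughly the following -- an illustrative
sketch only, not the actual lguest code; the function name and page array
are placeholders, and the address comes from the item above:)

static struct vm_struct *switcher_vma;

static int map_switcher_pages(struct page **pg, unsigned int npages)
{
    struct page **pages = pg;

    /* Reserve the fixed virtual range for the switcher shim... */
    switcher_vma = __get_vm_area(npages * PAGE_SIZE, VM_ALLOC,
                                 0xFFC01000UL,
                                 0xFFC01000UL + npages * PAGE_SIZE);
    if (!switcher_vma)
        return -ENOMEM;

    /* ...and back it with our pages; map_vm_area() advances the
     * pages pointer as it maps them. */
    return map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pages);
}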

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1054,6 +1054,7 @@ asmlinkage void math_state_restore(void)
thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
tsk->fpu_counter++;
 }
+EXPORT_SYMBOL_GPL(math_state_restore);
 
 #ifndef CONFIG_MATH_EMULATION
 
===
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -126,6 +126,7 @@ void __put_task_struct(struct task_struc
if (!profile_handoff_task(tsk))
free_task(tsk);
 }
+EXPORT_SYMBOL_GPL(__put_task_struct);
 
 void __init fork_init(unsigned long mempages)
 {
===
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2692,3 +2692,4 @@ int access_process_vm(struct task_struct
 
return buf - old_buf;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);
===
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -159,6 +159,7 @@ int map_vm_area(struct vm_struct *area, 
flush_cache_vmap((unsigned long) area->addr, end);
return err;
 }
+EXPORT_SYMBOL_GPL(map_vm_area);
 
 static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long 
flags,
unsigned long start, unsigned long 
end,
@@ -237,6 +238,7 @@ struct vm_struct *__get_vm_area(unsigned
 {
return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
 }
+EXPORT_SYMBOL_GPL(__get_vm_area);
 
 /**
  * get_vm_area  -  reserve a contingous kernel virtual area




[PATCH 7/7] cleanup: make disable_acpi() valid w/o CONFIG_ACPI

2007-02-11 Thread Rusty Russell
Len Brown <[EMAIL PROTECTED]> said:
> Okay, but better to use disable_acpi()
> indeed, since this would be the first code not already inside CONFIG_ACPI
> to invoke disable_acpi(), we could define the inline as empty and you could
> then scratch the #ifdef too.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/include/asm-i386/acpi.h
+++ b/include/asm-i386/acpi.h
@@ -127,6 +127,7 @@ extern int acpi_irq_balance_set(char *st
 #define acpi_ioapic 0
 static inline void acpi_noirq_set(void) { }
 static inline void acpi_disable_pci(void) { }
+static inline void disable_acpi(void) { }
 
 #endif /* !CONFIG_ACPI */
 




[PATCH 5/7] cleanup: Rename cpu_gdt_descr and remove extern declaration from smpboot.c

2007-02-11 Thread Rusty Russell
When I implemented the DECLARE_PER_CPU(var) macros, I was careful that
people couldn't use "var" in a non-percpu context, by prepending
percpu__.  I never considered that this would allow them to overload
the same name for a per-cpu and a non-percpu variable.

It is only one of many horrors in the i386 boot code, but let's rename
the non-percpu cpu_gdt_descr to early_gdt_descr (not boot_gdt_descr,
that's something else...)

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -309,7 +309,7 @@ 2:  movl %cr0,%eax
 
call check_x87
call setup_pda
-   lgdt cpu_gdt_descr
+   lgdt early_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
 1: movl $(__KERNEL_DS),%eax# reload all the segment registers
@@ -365,7 +365,7 @@ setup_pda:
movl start_pda, %eax
 
/* slot the PDA address into the GDT */
-   mov cpu_gdt_descr+2, %ecx
+   mov early_gdt_descr+2, %ecx
	mov %ax, (__KERNEL_PDA+0+2)(%ecx)   /* base & 0x0000ffff */
	shr $16, %eax
	mov %al, (__KERNEL_PDA+4+0)(%ecx)   /* base & 0x00ff0000 */
@@ -588,7 +588,7 @@ idt_descr:
 
 # boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-ENTRY(cpu_gdt_descr)
+ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
.long cpu_gdt_table
 
===
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -619,7 +619,6 @@ extern struct {
unsigned short ss;
 } stack_start;
 extern struct i386_pda *start_pda;
-extern struct Xgt_desc_struct cpu_gdt_descr;
 
 #ifdef CONFIG_NUMA
 
===
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -22,7 +22,7 @@ struct Xgt_desc_struct {
 
 extern struct Xgt_desc_struct idt_descr;
 DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-
+extern struct Xgt_desc_struct early_gdt_descr;
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {




[PATCH 6/7] cleanup: Remove extern declaration from mm/discontig.c, put in header.

2007-02-11 Thread Rusty Russell
Extern declarations belong in headers.  Times, they are a'changin.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -101,7 +101,6 @@ extern void add_one_highpage_init(struct
 extern void add_one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
-extern unsigned long init_pg_tables_end;
 extern unsigned long highend_pfn, highstart_pfn;
 extern unsigned long max_low_pfn;
 extern unsigned long totalram_pages;
===
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -77,6 +77,8 @@ void __init add_memory_region(unsigned l
 void __init add_memory_region(unsigned long long start,
  unsigned long long size, int type);
 
+extern unsigned long init_pg_tables_end;
+
 #endif /* __ASSEMBLY__ */
 
 #endif  /*  __KERNEL__  */




[PATCH 4/7] cleanup: Move mce_disabled to asm/mce.h

2007-02-11 Thread Rusty Russell
Allows code outside the mcheck subsystem to disable MCE.
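
A hypothetical user sketch (the function name is made up): with the extern
moved into <asm/mce.h>, external code can simply do

#include <linux/init.h>
#include <asm/mce.h>

static void __init example_no_mce_platform(void)
{
	mce_disabled = 1;	/* mcheck_init() then returns without setting up MCE */
}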

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
 #include 
+#include 
 
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_
 /* Call the installed machine check handler for this CPU setup. */
 extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
 
-extern int mce_disabled;
 extern int nr_mce_banks;
 
===
--- a/include/asm-i386/mce.h
+++ b/include/asm-i386/mce.h
@@ -3,3 +3,5 @@ extern void mcheck_init(struct cpuinfo_x
 #else
 #define mcheck_init(c) do {} while(0)
 #endif
+
+extern int mce_disabled;




[PATCH 3/7] cleanup: Make hvc_console.c compile on non-PowerPC

2007-02-11 Thread Rusty Russell
There's a really nice console helper (esp. for virtual console
drivers) in drivers/char/hvc_console.c.  It has only ever been used
for PowerPC, though, so it uses NO_IRQ which is only defined there.

Let's fix that so it's more widely useful.  By, say, lguest.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -48,6 +48,10 @@
 #define HVC_MINOR  0
 
 #define TIMEOUT		(10)
+
+#ifndef NO_IRQ
+#define NO_IRQ 0
+#endif
 
 /*
  * Wait this long per iteration while trying to push buffered data to the




[PATCH 2/7] cleanup: Initialize esp0 properly all the time

2007-02-11 Thread Rusty Russell
Whenever we schedule, __switch_to calls load_esp0 which does:

tss->esp0 = thread->esp0;

This is never initialized for the initial thread (i.e. "swapper"), so
when we're scheduling that, we end up setting esp0 to 0.  This is
fine: the swapper never leaves ring 0, so this field is never used.

lguest, however, gets upset that we're trying to use an unmapped page
as our kernel stack.  Rather than work around it there, let's
initialize it.
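
For context, a simplified sketch of the 2.6.20-era i386 load_esp0() from
include/asm-i386/processor.h (details trimmed); the first assignment is the
one that copies the uninitialized value into the TSS for the swapper:

static inline void load_esp0(struct tss_struct *tss,
			     struct thread_struct *thread)
{
	tss->esp0 = thread->esp0;	/* 0 for swapper before this patch */
	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
		tss->ss1 = thread->sysenter_cs;
		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
	}
}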

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

===
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -421,6 +421,7 @@ struct thread_struct {
 };
 
 #define INIT_THREAD  { \
+   .esp0 = sizeof(init_stack) + (long)&init_stack, \
.vm86_info = NULL,  \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL,  \




[PATCH 1/7] cleanup: paravirt unhandled fallthrough

2007-02-11 Thread Rusty Russell
The current code simply calls "start_kernel" directly if we're under a
hypervisor and no paravirt_ops backend wants us, because paravirt.c
registers that as a backend.

This was always a vain hope; start_kernel won't get far without setup.
It's also impossible for paravirt_ops backends which don't sit in the
arch/i386/kernel directory: they can't link before paravirt.o anyway.

Keep it simple: if we pass all the registered paravirt probes, BUG().
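
In C terms, the head.S loop below now behaves roughly like this (a sketch
under the assumption of a simplified probe signature; the real loop runs in
assembly with the boot registers still live):

#include <linux/kernel.h>

typedef void (*paravirt_probe_t)(void);	/* simplified; real probes take boot args */
extern paravirt_probe_t __start_paravirtprobe[], __stop_paravirtprobe[];

static void startup_paravirt_sketch(void)
{
	paravirt_probe_t *p;

	for (p = __start_paravirtprobe; p < __stop_paravirtprobe; p++)
		(*p)();		/* a probe that accepts this platform never returns */

	BUG();			/* nothing wanted us: no more start_kernel fallthrough */
}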

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

diff -r 4fb7fc327801 arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile Mon Feb 12 12:37:01 2007 +1100
+++ b/arch/i386/kernel/Makefile Mon Feb 12 12:55:00 2007 +1100
@@ -41,8 +41,6 @@ obj-$(CONFIG_KWATCH)  += debugreg.o kwat
 obj-$(CONFIG_KWATCH)   += debugreg.o kwatch.o
 
 obj-$(CONFIG_VMI)  += vmi.o vmitime.o
-
-# Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT) += paravirt.o
 
 EXTRA_AFLAGS   := -traditional
diff -r 4fb7fc327801 arch/i386/kernel/head.S
--- a/arch/i386/kernel/head.S   Mon Feb 12 12:37:01 2007 +1100
+++ b/arch/i386/kernel/head.S   Mon Feb 12 12:54:19 2007 +1100
@@ -513,10 +513,11 @@ startup_paravirt:
pushl   %ecx
pushl   %eax
 
-   /* paravirt.o is last in link, and that probe fn never returns */
pushl   $__start_paravirtprobe
 1:
	movl    0(%esp), %eax
+	cmpl    $__stop_paravirtprobe, %eax
+	je      unhandled_paravirt
	pushl   (%eax)
	movl    8(%esp), %eax
	call    *(%esp)
@@ -528,6 +529,10 @@ 1:

	addl    $4, (%esp)
	jmp     1b
+
+unhandled_paravirt:
+   /* Nothing wanted us: we're screwed. */ 
+   ud2
 #endif
 
 /*
diff -r 4fb7fc327801 arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c   Mon Feb 12 12:37:01 2007 +1100
+++ b/arch/i386/kernel/paravirt.c   Mon Feb 12 12:54:19 2007 +1100
@@ -481,9 +481,6 @@ static int __init print_banner(void)
return 0;
 }
 core_initcall(print_banner);
-
-/* We simply declare start_kernel to be the paravirt probe of last resort. */
-paravirt_probe(start_kernel);
 
 struct paravirt_ops paravirt_ops = {
.name = "bare hardware",




libata FUA revisited

2007-02-11 Thread Robert Hancock
I've been looking at some list archives from about a year ago when there 
was a big hoohah about FUA in libata. To summarize what I've gotten from 
that discussion:


Nicolas Mailhot ran into a problem with the first kernels that supported 
libata FUA, using a Silicon Image 3114 controller and a Maxtor 6L300S0 
drive with BANC1G10 firmware. Essentially it would quickly corrupt the 
filesystem on bootup. After that:


-A blacklist entry was added into libata disabling FUA on Maxtor drives 
with BANC1G10 firmware


-Eric Mudama from Maxtor complained that there was nothing wrong with 
FUA on those drives and the blacklist should be taken out (though it 
never was)


-It was also confirmed by Eric and others that Silicon Image 311x 
controllers go nuts if they're issued WRITE DMA FUA commands, at least 
without some driver improvements which I assume haven't happened.


-Eventually FUA was disabled by default globally in libata.

Given the above, what I'm proposing to do is:

-Remove the blacklisting of Maxtor BANC1G10 firmware for FUA. If we need 
to FUA-blacklist any drives this should likely be added to the existing 
"horkage" mechanism we now have. However, at this point I don't think 
that's needed, considering that I've seen no conclusive evidence that 
any drive has ever been established to have broken FUA.


-Add a new port flag ATA_FLAG_NO_FUA to indicate that a controller can't 
handle FUA commands, and add that flag to sata_sil. Force FUA off on any 
drive connected to a controller with this bit set (a rough sketch follows 
after this list).


There was some talk that sata_mv might have this problem, but I believe 
the conclusion was that it didn't. The only controllers that would be affected are 
ones that actually try to interpret the ATA command codes and don't know 
about WRITE DMA FUA.


-Change the fua module option to control FUA enable/disable to have a 
third value, "enable for NCQ-supporting drives only", which would become 
the new default. That case seems less likely to cause problems since FUA 
on NCQ is just another bit in the command whereas FUA on non-NCQ is an 
entirely different, potentially unsupported command.
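
A rough sketch of the controller-side gating proposed above (ATA_FLAG_NO_FUA
is the proposed name, not an existing flag; the helper is modelled loosely on
ata_dev_supports_fua() in 2.6.20's libata-scsi.c):

static int ata_dev_supports_fua(struct ata_device *dev)
{
	if (!libata_fua)			/* "fua" module option (would grow a third value) */
		return 0;
	if (dev->ap->flags & ATA_FLAG_NO_FUA)	/* controller (e.g. sata_sil) can't do FUA */
		return 0;
	if (!ata_id_has_fua(dev->id))		/* drive doesn't advertise FUA */
		return 0;
	return 1;
}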


Any comments?

As an aside, I came across a comment that the Silicon Image Windows 
drivers for NCQ-supporting controllers have some blacklist entries for 
drives with broken NCQ in their .inf files. We only seem to have one in 
the libata NCQ blacklist; we may want to add some more of these. The 
ones in the current SiI3124 and 3132 drivers' .inf files for 
"DisableSataQueueing" appear to be:


Model   Firmware
Maxtor 7B250S0  BANC1B70
HTS541060G9SA00 MB3OC60D
HTS541080G9SA00 MB4OC60D
HTS541010G9SA00 MBZOC60D

(the latter 3 being Hitachi Travelstar drives)
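
For illustration, here is roughly what adding those to libata's existing
blacklist would look like (ata_device_blacklist and ATA_HORKAGE_NONCQ as in
2.6.20's libata-core.c; whether these entries are actually wanted is exactly
the open question):

static const struct ata_blacklist_entry ata_device_blacklist [] = {
	/* ... existing entries ... */

	/* Drives the SiI3124/3132 Windows .inf files mark "DisableSataQueueing" */
	{ "Maxtor 7B250S0",	"BANC1B70",	ATA_HORKAGE_NONCQ },
	{ "HTS541060G9SA00",	"MB3OC60D",	ATA_HORKAGE_NONCQ },
	{ "HTS541080G9SA00",	"MB4OC60D",	ATA_HORKAGE_NONCQ },
	{ "HTS541010G9SA00",	"MBZOC60D",	ATA_HORKAGE_NONCQ },

	/* End Marker */
	{ }
};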

--
Robert Hancock  Saskatoon, SK, Canada
To email, remove "nospam" from [EMAIL PROTECTED]
Home Page: http://www.roberthancock.com/


Re: genirq: Add a set_irq_handler_locked() function

2007-02-11 Thread David Gibson
On Fri, Feb 09, 2007 at 07:36:40AM +, Russell King wrote:
> On Fri, Feb 09, 2007 at 02:48:42PM +1100, David Gibson wrote:
> > At present set_irq_handler() and all the existing variants take the
> > desc->lock for the irq in question before adjusting the irq's flow
> > handler.  This can cause problems for irq chips for which a given
> > interrupt can be either level or edge depending on what's attached.
> 
> Are you sure you need to change the flow handler depending on how
> you program the device?
> 
> Since the outset of this design, I've had what are essentially edge
> based interrupt sources using the "level" handlers because they haven't
> had a "broken" edge implementation.  By that, I mean that the masking
> is done in such a way that you miss edges when the source is masked.
> 
> If you do not miss edges while the source is masked, there's no point
> in having the complexity of the "edge" based handler in the path - it
> buys you nothing.  Just use the "level" handler instead.

I see... how terribly obvious.

As far as I know, the 4xx UIC does things correctly, though I don't
have handy any devices with edge interrupts to test it with.

It would still be nice to have this change, so we can use the
lazy-masking from handle_edge_irq(), but I guess I can do without it
for now.
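
For reference, the kind of helper the subject line refers to could look
roughly like this (a sketch against the 2.6.20-era genirq structures, not the
actual proposed patch):

#include <linux/irq.h>

/* Like set_irq_handler(), but for callers (e.g. an irq_chip's ->set_type()
 * hook) that already hold desc->lock and so must not re-take it. */
static inline void set_irq_handler_locked(unsigned int irq,
					  irq_flow_handler_t handle)
{
	struct irq_desc *desc = irq_desc + irq;

	desc->handle_irq = handle;
}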

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


Re: [git patches] libata updates 1 of 3

2007-02-11 Thread Markus Trippelsdorf
On Sun, Feb 11, 2007 at 05:08:17PM -0500, Tejun Heo wrote:
> Markus Trippelsdorf wrote:
> >This update breaks sata_via on my VIA K8T800Pro machine:
> > 
> > sata_via 0000:00:0f.0: failed to iomap PCI BAR 0
> > sata_via 0000:00:0f.0: out of memory
> > sata_via probe of 0000:00:0f.0 failed with error -12
> 
> Please post full dmesg and the result of 'lspci -nnvvvxxx'.
> 
I attach the output of both commands from the last working kernel.
Because my root directory lies on the sata drive I get a kernel panic
otherwise. I have no possibility to do remote debugging at the
moment.
-- 
Markus
[attachment truncated in the archive: output of "lspci -nnvvvxxx" and dmesg
from the last working kernel. The readable parts show the VIA K8T800Pro host
bridge [1106:0282] on an ASUSTeK A8V Deluxe [1043:80a3], a Brooktree Bt878
video capture card [109e:036e] (Pinnacle PCTV pro), and the start of the
dmesg for Linux 2.6.20-g0670afdf (gcc 4.1.1, Gentoo, SMP, booted with
root=/dev/sda1), including the BIOS-e820 memory map (roughly 2 GB usable),
the AMI ACPI tables, and the early zone/PFN setup; the rest of the output is
cut off.]
