[patch] sched: accurate user accounting

2007-03-24 Thread Ingo Molnar

* Con Kolivas <[EMAIL PROTECTED]> wrote:

> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?

we want to do this - and we should do this to the vanilla scheduler 
first and check the results. I've back-merged the patch to before RSDL 
and have tested it - find the patch below. Vale, could you try this 
patch against a 2.6.21-rc4-ish kernel and re-test your testcase?

Ingo

->
Subject: [patch] sched: accurate user accounting
From: Con Kolivas <[EMAIL PROTECTED]>

Currently we only do cpu accounting to userspace based on what is 
actually happening precisely on each tick. The accuracy of that 
accounting gets progressively worse the lower HZ is. As we already keep 
accounting of nanosecond resolution we can accurately track user cpu, 
nice cpu and idle cpu if we move the accounting to update_cpu_clock with 
a nanosecond cpu_usage_stat entry. This increases overhead slightly but 
avoids the problem of tick aliasing errors making accounting unreliable.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>
Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>
---
 include/linux/kernel_stat.h |3 ++
 include/linux/sched.h   |2 -
 kernel/sched.c  |   52 +---
 kernel/timer.c  |5 +---
 4 files changed, 55 insertions(+), 7 deletions(-)

Index: linux/include/linux/kernel_stat.h
===
--- linux.orig/include/linux/kernel_stat.h
+++ linux/include/linux/kernel_stat.h
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
cputime64_t user;
+   cputime64_t user_ns;
cputime64_t nice;
+   cputime64_t nice_ns;
cputime64_t system;
cputime64_t softirq;
cputime64_t irq;
cputime64_t idle;
+   cputime64_t idle_ns;
cputime64_t iowait;
cputime64_t steal;
 };
Index: linux/include/linux/sched.h
===
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -882,7 +882,7 @@ struct task_struct {
int __user *clear_child_tid;/* CLONE_CHILD_CLEARTID */
 
unsigned long rt_priority;
-   cputime_t utime, stime;
+   cputime_t utime, utime_ns, stime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or 
thread-specific */
Index: linux/kernel/sched.c
===
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -167,6 +167,12 @@ unsigned long long __attribute__((weak))
(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)   ((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)   ((TIME) * (1000000000 / HZ))
+
 #define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)
 
@@ -3017,8 +3023,50 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-   p->sched_time += now - p->last_ran;
+   struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+   cputime64_t time_diff = now - p->last_ran;
+
+   p->sched_time += time_diff;
p->last_ran = rq->most_recent_timestamp = now;
+   if (p != rq->idle) {
+   cputime_t utime_diff = time_diff;
+
+   if (TASK_NICE(p) > 0) {
+   cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+time_diff);
+   if (NS_TO_JIFFIES(cpustat->nice_ns) > 1) {
+   cpustat->nice_ns =
+   cputime64_sub(cpustat->nice_ns,
+   JIFFIES_TO_NS(1));
+   cpustat->nice =
+   cputime64_add(cpustat->nice, 1);
+   }
+   } else {
+   cpustat->user_ns = cputime64_add(cpustat->user_ns,
+   time_diff);
+   if (NS_TO_JIFFIES(cpustat->user_ns) > 1) {
+   cpustat->user_ns =
+   cputime64_sub(cpustat->user_ns,
+   JIFFIES_TO_NS(1));
+   cpustat ->user =
+   cputime64_add(cpustat->user, 1);
+   }
+   }
+   p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+   if (NS_TO_JIFFIES(p->utime_ns) > 1) {
+   p->utime_ns = cputime_sub(p->utime_ns,
+ JIFFIES_TO_NS(1));

Re: Early hang with 2.6.21-rc4-rt1

2007-03-24 Thread Ingo Molnar

* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> > OK, another data point.  The config below boots and works with 
> > 2.6.21-rc4-rt1, but enabling CONFIG_CRITICAL_IRQSOFF_TIMING causes the 
> > early boot hang.
>
> ah, i havent tried that option in quite some time, so bitrot is pretty 
> likely. Does the problem go away if you disable CONFIG_FUNCTION_TRACE? 

hm - on 32-bit, CRITICAL_IRQSOFF_TIMING+FUNCTION_TRACING works fine for 
me. I'll try the 64-bit kernel too.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [-mm patch] lguest: #if 0 check_bug_kill()

2007-03-24 Thread Rusty Russell
On Sat, 2007-03-24 at 14:06 +0100, Adrian Bunk wrote:
> On Mon, Mar 19, 2007 at 08:56:23PM -0800, Andrew Morton wrote:
> >...
> > Changes since 2.6.21-rc3-mm1:
> >...
> > +lguest-use-read-only-pages-rather-than-segments-to-protect-high-mapped-switcher.patch
> >...
> >  x86/x86_64 updates
> >...
> 
> 
> check_bug_kill() is no longer used.

Thanks Adrian, that was actually an oversight.  However, this function
is most useful in early bringup, so I didn't notice it was gone.

I'd prefer a patch which eliminates it altogether, rather than #if 0 it
out.

Thanks!
Rusty.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Early hang with 2.6.21-rc4-rt1

2007-03-24 Thread Ingo Molnar

* Roland Dreier <[EMAIL PROTECTED]> wrote:

> I'm trying to use 2.6.21-rc4-rt1 to track down who's keeping 
> interrupts off for too long. [...]

btw., is this something you know for sure (if yes, how do you know?) - 
or is it that you would like to double-check the irqs-off times of 
v2.6.21-to-be?

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] USB Elan FTDI: check for driver registration status

2007-03-24 Thread Cyrill Gorcunov
This patch adds a check of the driver registration status
and, if registration fails, releases the allocated resources.

Signed-off-by: Cyrill Gorcunov <[EMAIL PROTECTED]>

---

Pete, please review the patch and Ack it then.

 drivers/usb/misc/ftdi-elan.c |   37 +++--
 1 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c
index bc3327e..3cd9ae3 100644
--- a/drivers/usb/misc/ftdi-elan.c
+++ b/drivers/usb/misc/ftdi-elan.c
@@ -2909,27 +2909,36 @@ static int __init ftdi_elan_init(void)
 init_MUTEX(&ftdi_module_lock);
 INIT_LIST_HEAD(&ftdi_static_list);
 status_queue = create_singlethread_workqueue("ftdi-status-control");
-   if (!status_queue)
-   goto err1;
 command_queue = create_singlethread_workqueue("ftdi-command-engine");
-   if (!command_queue)
-   goto err2;
 respond_queue = create_singlethread_workqueue("ftdi-respond-engine");
-   if (!respond_queue)
-   goto err3;
+   if (!status_queue || !command_queue || !respond_queue) {
+   printk(KERN_ERR "%s couldn't create workqueue\n",
+  ftdi_elan_driver.name);
+   result = -ENOMEM;
+   goto err;
+   }
 result = usb_register(&ftdi_elan_driver);
-if (result)
+if (result) {
 printk(KERN_ERR "usb_register failed. Error number %d\n",
   result);
+   goto err;
+   }
 return result;
 
- err3:
-   destroy_workqueue(command_queue);
- err2:
-   destroy_workqueue(status_queue);
- err1:
-   printk(KERN_ERR "%s couldn't create workqueue\n", 
ftdi_elan_driver.name);
-   return -ENOMEM;
+ err:
+   if (status_queue) {
+   destroy_workqueue(status_queue);
+   status_queue = NULL;
+   }
+   if (command_queue) {
+   destroy_workqueue(command_queue);
+   command_queue = NULL;
+   }
+   if (respond_queue) {
+   destroy_workqueue(respond_queue);
+   respond_queue = NULL;
+   }
+   return result;
 }
 
 static void __exit ftdi_elan_exit(void)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [4/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Thomas Gleixner
On Sun, 2007-03-25 at 09:11 +0200, Michael S. Tsirkin wrote:
> > I lost track of Michaels various nested problems.
> > 
> > Michael can you please give a summary on _all_ entries in the
> > regressions list against Linus latest ?
> 
> I tested 2 different configurations on my T60:
> - With CONFIG_NO_HZ enabled.
>   I tested this on -rc1, and have not retested with CONFIG_NO_HZ since.
>   Observed behaviour: the system would not come out of suspend to RAM.
>   After I press Fn/F4 the crescent LED starts blinking so it seems Linux 
> started
>   doing something.
>   This is a problem but not a regression as such, since CONFIG_NO_HZ is new
>   in 2.6.21.

It needs to be fixed before 2.6.21 final nevertheless.

> - Without CONFIG_NO_HZ
>   I last tested this with cd05a1f818073a623455a58e756c5b419fc98db9.
>   After the system comes out of suspend to RAM, I observed the following
>   behaviour (I used s2ram from console):
>   1. The first disk access takes much longer than with 2.6.20
>   2. System clock does not advance (date always reports the same time)
>   3. After an attempt to switch to X, X starts drawing some windows and then 
> hangs
> 
>   All 3 issues are new and did not occur under 2.6.20, so this is a 
> regression.
>   Attached is a full dmesg from boot to resume.

There is not much interesting to see in the log.

Can you please test the following:

Add "clocksource=acpi_pm" to the kernel commandline.

If this does not change anything, then disable CONFIG_HPET and retry.


One thing in the log is indeed scary:

[2.959150] Calibrating delay using timer specific routine.. 20089.12
BogoMIPS (lpj=100445639)

This is after the reboot, but it is not related to your problem. This is
a different problem, which needs urgent attention.

Adrian, can you open a separate entry for this please ? It is not a new
thing, this can be observed with older kernels as well, but it needs to
be addressed. It probably needs a similar solution as I did for the
local apic timer calibration.

tglx


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Early hang with 2.6.21-rc4-rt1

2007-03-24 Thread Ingo Molnar

* Roland Dreier <[EMAIL PROTECTED]> wrote:

> OK, another data point.  The config below boots and works with 
> 2.6.21-rc4-rt1, but enabling CONFIG_CRITICAL_IRQSOFF_TIMING causes the 
> early boot hang.
> 
> Any idea?

ah, i havent tried that option in quite some time, so bitrot is pretty 
likely. Does the problem go away if you disable CONFIG_FUNCTION_TRACE? 
If the problem goes away then it would suggest some sort of 
mcount()-driven recursion into hardirqs_off/hardirqs_on.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: (usagi-core 32638) Re: [linux-usb-devel] [PATCH 0/2] [SERIAL] [USB] fixed to skip NULL entry in struct serial usb_serial_port.

2007-03-24 Thread Noriaki TAKAMIYA
Hi,

>> Sat, 24 Mar 2007 14:22:53 -0700
>> [Subject: (usagi-core 32638) Re: [linux-usb-devel] [PATCH 0/2] [SERIAL] 
>> [USB] fixed to skip NULL entry in struct serial usb_serial_port.]
>> Greg KH <[EMAIL PROTECTED]> wrote...

> This should already be fixed in the -git snapshots that have come out
> after 2.6.21-rc4.  Can you test them to verify this?

  Yes, this problem was already fixed.

  Thanks.

--
Noriaki TAKAMIYA
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Paul Jackson
vatsa wrote:
> Not just this, continuing further we have more trouble:
> 
> 
> CPU0 (attach_task T1 to CS2)  CPU1 (T1 is exiting)
> 
> 
> synchronize_rcu()
>   atomic_dec(&CS1->count);
>   [CS1->count = 0]
> 
> if atomic_dec_and_test(&oldcs->count))
>   [CS1->count = -1]
> ...
> 2nd race is tricky. We probably need to do this to avoid it:
> 
>   task_lock(tsk);
> 
>   /* Check if tsk->cpuset is still same. We may have raced with 
>* cpuset_exit changing tsk->cpuset again under our feet.
>*/
>   if (tsk->cpuset == cs && atomic_dec_and_test(&oldcs->count)) {

I'm unsure here, but this 'tsk->cpuset == cs' test feels fragile to me.

How about a bit earlier in attach_task(), right at the point we overwrite the
victim task's cpuset pointer, we decrement the count on the old cpuset, and if
it went to zero, remember that we'll need to release it, once we've dropped
some locks:

static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
{
... 

struct cpuset *oldcs;
struct cpuset *oldcs_tobe_released;

...

task_lock(tsk);
oldcs = tsk->cpuset;
...
if (tsk->flags & PF_EXITING) {
...
}
atomic_inc(&cs->count);
rcu_assign_pointer(tsk->cpuset, cs);
oldcs_tobe_released = NULL;
if (atomic_dec_and_test(&oldcs->count))
oldcs_tobe_released = oldcs;
task_unlock(tsk);

...
put_task_struct(tsk);
synchronize_rcu();
if (oldcs_tobe_released)
check_for_release(oldcs_tobe_released, ppathbuf);
return 0;
}

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: chrdev_open lifetime question

2007-03-24 Thread Dmitry Torokhov
Hi Pete,

On Monday 19 March 2007 21:02, Pete Zaitcev wrote:
> On Wed, 7 Mar 2007 17:23:05 -0500, "Dmitry Torokhov" <[EMAIL PROTECTED]> 
> wrote:
> 
> > It seems that if a process keeps a character device open then other
> > processes will also be able to get into filp->f_op->open(inode,filp)
> > in chrdev_open() even after a driver called cdev_del() as part of its
> > unwind procedure. Is this correct or am I missing something?
> 
> I see no replies in the archives. Have you got any private ones?

No I have not.

> 
> Also, what's the context?

I want to switch input handlers to cdevs to rip table of devices out
of input core and lift limit on potential number of devices and was
pondering locking implications.

-- 
Dmitry
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: keyboard.c: Stop flooding dmesg with useless warnings

2007-03-24 Thread Parag Warudkar

On 3/25/07, Dmitry Torokhov <[EMAIL PROTECTED]> wrote:

On Sunday 25 March 2007 01:27, Parag Warudkar wrote:
> >
> > Actually the keyboard driver should not emit input events for that key code.
> > Is this a USB keyboard?
> >
> > --
> > Dmitry
> >
>
> Yes this is a USB keyboard.
>
> Any hint as to where I should start looking to make the driver not
> emit input event for keycode==0?
>

Was it always doing that? I'll add Jiri Kosina to the CC list as he's
involved with HID now.



Something (some keypress?) triggers it (by default it doesn't happen
immediately after a boot) - but once it starts it doesn't seem to stop
printing.

Parag
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Early hang with 2.6.21-rc4-rt1

2007-03-24 Thread Roland Dreier
OK, another data point.  The config below boots and works with
2.6.21-rc4-rt1, but enabling CONFIG_CRITICAL_IRQSOFF_TIMING causes the
early boot hang.

Any idea?

Thanks,
  Roland

#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.21-rc4-rt1
# Sat Mar 24 22:06:44 2007
#
CONFIG_X86_64=y
CONFIG_64BIT=y
CONFIG_X86=y
CONFIG_PARAVIRT=y
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_ZONE_DMA32=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
CONFIG_ZONE_DMA=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_X86_CMPXCHG=y
CONFIG_EARLY_PRINTK=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_DMI=y
CONFIG_AUDIT_ARCH=y
CONFIG_GENERIC_BUG=y
# CONFIG_ARCH_HAS_ILOG2_U32 is not set
# CONFIG_ARCH_HAS_ILOG2_U64 is not set
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"

#
# Code maturity level options
#
CONFIG_EXPERIMENTAL=y
CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32

#
# General setup
#
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
# CONFIG_IPC_NS is not set
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
# CONFIG_TASKSTATS is not set
# CONFIG_UTS_NS is not set
CONFIG_AUDIT=y
# CONFIG_AUDITSYSCALL is not set
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_CPUSETS=y
# CONFIG_SYSFS_DEPRECATED is not set
# CONFIG_RELAY is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
# CONFIG_KALLSYMS_ALL is not set
# CONFIG_KALLSYMS_EXTRA_PASS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SHMEM=y
CONFIG_SLAB=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
# CONFIG_SLOB is not set

#
# Loadable module support
#
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_MODVERSIONS=y
# CONFIG_MODULE_SRCVERSION_ALL is not set
CONFIG_KMOD=y
CONFIG_STOP_MACHINE=y

#
# Block layer
#
CONFIG_BLOCK=y
# CONFIG_BLK_DEV_IO_TRACE is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
CONFIG_IOSCHED_AS=y
CONFIG_IOSCHED_DEADLINE=y
CONFIG_IOSCHED_CFQ=y
# CONFIG_DEFAULT_AS is not set
# CONFIG_DEFAULT_DEADLINE is not set
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"

#
# Processor type and features
#
CONFIG_X86_PC=y
# CONFIG_X86_VSMP is not set
CONFIG_MK8=y
# CONFIG_MPSC is not set
# CONFIG_MCORE2 is not set
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_L1_CACHE_BYTES=64
CONFIG_X86_L1_CACHE_SHIFT=6
CONFIG_X86_INTERNODE_CACHE_BYTES=64
CONFIG_X86_TSC=y
CONFIG_X86_GOOD_APIC=y
CONFIG_MICROCODE=m
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=m
CONFIG_X86_CPUID=m
CONFIG_X86_IO_APIC=y
CONFIG_X86_LOCAL_APIC=y
CONFIG_MTRR=y
CONFIG_SMP=y
CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_TICK_ONESHOT is not set
# CONFIG_NO_HZ is not set
# CONFIG_HIGH_RES_TIMERS is not set
CONFIG_PREEMPT_NONE=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT_DESKTOP is not set
# CONFIG_PREEMPT_RT is not set
# CONFIG_PREEMPT_SOFTIRQS is not set
# CONFIG_PREEMPT_HARDIRQS is not set
# CONFIG_SPINLOCK_BKL is not set
# CONFIG_PREEMPT_BKL is not set
CONFIG_CLASSIC_RCU=y
# CONFIG_PREEMPT_RCU is not set
# CONFIG_RCU_TRACE is not set
CONFIG_NUMA=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
CONFIG_K8_NUMA=y
CONFIG_NODES_SHIFT=6
CONFIG_X86_64_ACPI_NUMA=y
# CONFIG_NUMA_EMU is not set
CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_SELECT_MEMORY_MODEL=y
# CONFIG_FLATMEM_MANUAL is not set
CONFIG_DISCONTIGMEM_MANUAL=y
# CONFIG_SPARSEMEM_MANUAL is not set
CONFIG_DISCONTIGMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_NEED_MULTIPLE_NODES=y
# CONFIG_SPARSEMEM_STATIC is not set
# CONFIG_MEMORY_HOTPLUG is not set
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
CONFIG_RESOURCES_64BIT=y
CONFIG_ZONE_DMA_FLAG=1
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
# CONFIG_HOTPLUG_CPU is not set
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_IOMMU=y
# CONFIG_CALGARY_IOMMU is not set
CONFIG_SWIOTLB=y
CONFIG_X86_MCE=y
# CONFIG_X86_MCE_INTEL is not set
CONFIG_X86_MCE_AMD=y
CONFIG_KEXEC=y
# CONFIG_CRASH_DUMP is not set
CONFIG_PHYSICAL_START=0x20
# CONFIG_SECCOMP is not set
# CONFIG_CC_STACKPROTECTOR is not set
# CONFIG_HZ_100 is not set
CONFIG_HZ_250=y
# CONFIG_HZ_300 is not set
# CONFIG_HZ_1000 is not set
CONFIG_HZ=250
CONFIG_REORDER=y
CONFIG_K8_NB=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_ISA_DMA_API=y
CONFIG_GENERIC_PENDING_IRQ=y

#
# Power management options
#
CONFIG_PM=y
# CONFIG_PM_LEGA

Re: keyboard.c: Stop flooding dmesg with useless warnings

2007-03-24 Thread Dmitry Torokhov
On Sunday 25 March 2007 01:27, Parag Warudkar wrote:
> >
> > Actually the keyboard driver should not emit input events for that key code.
> > Is this a USB keyboard?
> >
> > --
> > Dmitry
> >
> 
> Yes this is a USB keyboard.
> 
> Any hint as to where I should start looking to make the driver not
> emit input event for keycode==0?
>

Was it always doing that? I'll add Jiri Kosina to the CC list as he's
involved with HID now.

-- 
Dmitry
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: keyboard.c: Stop flooding dmesg with useless warnings

2007-03-24 Thread Parag Warudkar


Actually the keyboard driver should not emit input events for that key code.
Is this a USB keyboard?

--
Dmitry



Yes this is a USB keyboard.

Any hint as to where I should start looking to make the driver not
emit input event for keycode==0?

Parag
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: keyboard.c: Stop flooding dmesg with useless warnings

2007-03-24 Thread Dmitry Torokhov
On Saturday 24 March 2007 23:38, Parag Warudkar wrote:
> 
> >
> > Signed-off-by: Parag Warudkar <[EMAIL PROTECTED]>
> >
> > --- linux-2.6-wk/drivers/char/keyboard.c2007-03-24 23:01:19.0 
> > -0400
> > +++ linux-2.6/drivers/char/keyboard.c   2007-03-24 21:43:58.0 
> > -0400
> > @@ -1161,7 +1161,7 @@
> >
> > if ((raw_mode = (kbd->kbdmode == VC_RAW)) && !hw_raw)
> > if (emulate_raw(vc, keycode, !down << 7))
> > -   if (keycode < BTN_MISC && keycode != KEY_RESERVED)
> > +   if (keycode < BTN_MISC)
> > printk(KERN_WARNING "keyboard.c: can't 
> > emulate rawmode for keycode %d\n", keycode);
> >
> > #ifdef CONFIG_MAGIC_SYSRQ  /* Handle the SysRq Hack */
> >
> 
> Yikes. Wrong one above. Right one below. Against latest git.
> 
> I use  Apple keyboard and mouse which seem to generate events with
> keycode==0.
> 
> keyboard.c floods dmesg endlessly with below messages. This happens at a
> very fast rate and never stops, leaving the dmesg unusable.
> 
> [46591.960000] keyboard.c: can't emulate rawmode for keycode 0
> [46591.996000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.032000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.068000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.104000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.140000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.176000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.212000] keyboard.c: can't emulate rawmode for keycode 0
> [46592.248000] keyboard.c: can't emulate rawmode for keycode 0
> 
> The patch below avoids printing the warning if keycode == KEY_RESERVED.
> 
> If a more correct fix is possible please let me know and I will redo this.
> (I suspect avoiding call to emulate_raw() with value ranges it cannot
> emulate might be a better fix?)

Actually the keyboard driver should not emit input events for that key code.
Is this a USB keyboard?

-- 
Dmitry
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [1/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Paul Collins
David Miller <[EMAIL PROTECTED]> writes:

> Furthermore, his scripts don't even execute properly when
> I try to run them myself, for example server.py gives me
> this syntax error when python tries to parse the script:
>
> [EMAIL PROTECTED]:~/src/GIT/net-2.6$ /usr/bin/python server.py
>   File "server.py", line 9
> struct sockaddr_in ServerAddr;
>  ^
> SyntaxError: invalid syntax

The reporter's client and server code seem to be C, but if he thinks
they should be Python, I guess they're not the correct ones anyway.

-- 
Paul Collins
Wellington, New Zealand

Dag vijandelijk luchtschip de huismeester is dood
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


RE: About GCC4 Optimization

2007-03-24 Thread David Schwartz

> So what gcc does may be technically legal, but it's still a horribly
> bad thing to do. Sadly, some gcc people seem to care more
> about "letter
> of the law" than "sanity and quality of implementation".

You know, it would be one thing if they were consistent. A policy that, by
default, you get all the optimizations the relevant standards allow wouldn't
be a problem. But they do this when they feel like it, and they disable
significant optimizations even where the standards allow them when they feel
like that.

See, for example:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=20099

"You cannot create code that works with this option and doesn't work without
it except by violating the POSIX standard. So POSIX code should not have
this option enabled by default -- it's a pure pessimization." Yet the option
is on by default when -pthreads is specified.

DS

PS: Yes, I'm still pissed about this. ;)


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Paul Jackson
> I will try to send out a patch later today to fix

Thanks!

> Agreed, but good to keep code clean isn't it? :)

Definitely.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Srivatsa Vaddagiri
On Sat, Mar 24, 2007 at 09:45:50PM -0700, Paul Jackson wrote:
> Nice work - thanks.  Yes, both an extra cpuset count and a negative
> cpuset count are bad news, opening the door to the usual catastrophes.
> 
> Would you like the honor of submitting the patch to add a task_lock
> to cpuset_exit()?  If you do, be sure to fix, or at least remove,
> the cpuset_exit comment lines:

I will try to send out a patch later today to fix this bug in mainline
cpuset code. I happened to notice this race with my rcfs patch and observed 
same is true with cpuset/container code also.

>  * We don't need to task_lock() this reference to tsk->cpuset,
>  * because tsk is already marked PF_EXITING, so attach_task() won't
>  * mess with it, or task is a failed fork, never visible to attach_task.

Sure, I had seen that.

> So, in real life, this would be a difficult race to trigger.

Agreed, but good to keep code clean isn't it? :)

> Thanks for finding this.

Welcome!

-- 
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Paul Jackson
vatsa wrote:
> Now consider:

Nice work - thanks.  Yes, both an extra cpuset count and a negative
cpuset count are bad news, opening the door to the usual catastrophes.

Would you like the honor of submitting the patch to add a task_lock
to cpuset_exit()?  If you do, be sure to fix, or at least remove,
the cpuset_exit comment lines:

 * We don't need to task_lock() this reference to tsk->cpuset,
 * because tsk is already marked PF_EXITING, so attach_task() won't
 * mess with it, or task is a failed fork, never visible to attach_task.

I guess that taking task_lock() in cpuset_exit() should not be a serious
performance issue.  It's taking a spinlock that is in the current
exiting task's task struct, so it should be a cache hot memory line and
a rarely contested lock.

And I guess I've not seen this race in real life, as one side of it has
to execute quite a bit of code in the task exit path, from when it sets
PF_EXITING until it gets into the cpuset_exit() call, while the other side
does the three lines:

if (tsk->flags & PF_EXITING) ...
atomic_inc(&cs->count);
rcu_assign_pointer(tsk->cpuset, cs);

So, in real life, this would be a difficult race to trigger.

Thanks for finding this.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [1/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread David Miller
From: Adrian Bunk <[EMAIL PROTECTED]>
Date: Fri, 23 Mar 2007 19:48:17 +0100

> Subject: problem with sockets
> References : http://lkml.org/lkml/2007/3/21/248
> Submitter  : Jose Alberto Reguero <[EMAIL PROTECTED]>
> Status : unknown

Not enough information in his report, for example for the
case he says does not work he fails to indicate what kernel
or system type the Client runs on.

Furthermore, his scripts don't even execute properly when
I try to run them myself, for example server.py gives me
this syntax error when python tries to parse the script:

[EMAIL PROTECTED]:~/src/GIT/net-2.6$ /usr/bin/python server.py
  File "server.py", line 9
struct sockaddr_in ServerAddr;
 ^
SyntaxError: invalid syntax


Can someone help de-crapify this bug report?
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: About GCC4 Optimization

2007-03-24 Thread Andrew Morton
On Sun, 25 Mar 2007 03:42:25 +0000 "yuan cooper" <[EMAIL PROTECTED]> wrote:

> Hi all:
>  
> during my work, I found there is a bug with GCC4 O2 optimization.
>  
> -
> float ftmp;
> unsigned long tmp;
> ftmp = 1.0/1024.0;
> tmp  = *(unsigned long *)(&ftmp);
> tmp  = (tmp >> 11) && 0xFFF;
> - 
>  
> if optimization level is O2, gcc will MOV eax to tmp, but current eax has a 
> random value.
> -O is ok and gcc3 with O2 is ok too.
>  
>  
> I am a kernel newbie, I don't know how to make contributions to janitors, who 
> will help me? It's my first post, any suggestion will be appreciated.

Don't use floating point in kernel code.  At all.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux-VServer example results for sharing vs. separate mappings ...

2007-03-24 Thread Andrew Morton
On Sun, 25 Mar 2007 04:21:56 +0200 Herbert Poetzl <[EMAIL PROTECTED]> wrote:

> > a) slice the machine into 128 fake NUMA nodes, use each node as the
> >basic block of memory allocation, manage the binding between these
> >memory hunks and process groups with cpusets.
> 
> 128 sounds a little small to me, considering that we
> already see 300+ Guests on older machines 
> (or am I missing something here?)

Yes, you're missing something very significant.  I'm talking about resource
management (ie: partitioning) and you're talking about virtual servers. 
They're different applications, with quite a lot in common.

For resource management, a few fives or tens of containers is probably an
upper bound.

An implementation needs to address both requirements.

> >This is what google are testing, and it works.
> > 
> > b) Create a new memory abstraction, call it the "software zone",
> >which is mostly decoupled from the present "hardware zones". Most of
> >the MM is reworked to use "software zones". The "software zones" are
> >runtime-resizeable, and obtain their pages via some means from the
> >hardware zones. A container uses a software zone.
> > 
> > c) Something else, similar to the above.  Various schemes can be
> >envisaged, it isn't terribly important for this discussion.
> 
> for me, the most natural approach is the one with 
> the least impact and smallest number of changes
> in the (granted quite complex) system: leave 
> everything as is, from the 'entire system' point
> of view, and do adjustments and decisions with the
> additional Guest/Context information in mind ...
> 
> e.g. if we decide to reclaim pages, and the 'normal'
> mechanism would end up with 100 'equal' candidates,
> the Guest badness can be a good additional criterion
> to decide which pages get thrown out ...
> 
> OTOH, the Guest status should never control the
> entire system behaviour in a way which harms the
> overall performance or resource efficiency

On the contrary - if one container exceeds its allotted resource, we want
the processes in that container to bear the majority of the cost of that. 
Ideally, all of the cost.

> 
> > All doable, if we indeed have a demonstrable problem
> > which needs to be addressed.
> 
> all in all I seem to be missing the 'original problem'
> which basically forces us to do all those things you
> describe instead of letting the Linux Memory System
> work as it works right now and just get the accounting
> right ...

The VM presently cannot satisfy resource management requirements, because
piggy activity from one job will impact the performance of all other jobs.

> > > note that the 'frowned upon' accounting Linux-VServer
> > > does seems to work for those cases quite fine .. here
> > > the relevant accounting/limits for three guests, the
> > > first two unified and started in strict sequence, the
> > > third one completely separate
> > > 
> > > Limit  current min/max  soft/hard hits
> > > VM:  41739   0/   64023   -1/  -1  0
> > > RSS:  8073   0/9222   -1/  -1  0
> > > ANON: 3110   0/3405   -1/  -1  0
> > > RMAP: 4960   0/5889   -1/  -1  0
> > > SHM:  7138   0/7138   -1/  -1  0
> > > 
> > > Limit  current min/max  soft/hard hits
> > > VM:  41738   0/   64163   -1/  -1  0
> > > RSS:  8058   0/9383   -1/  -1  0
> > > ANON: 3108   0/3505   -1/  -1  0
> > > RMAP: 4950   0/5912   -1/  -1  0
> > > SHM:  7138   0/7138   -1/  -1  0
> > > 
> > > Limit  current min/max  soft/hard hits
> > > VM:  41738   0/   63912   -1/  -1  0
> > > RSS:  8050   0/9211   -1/  -1  0
> > > ANON: 3104   0/3399   -1/  -1  0
> > > RMAP: 4946   0/5885   -1/  -1  0
> > > SHM:  7138   0/7138   -1/  -1  0
> > 
> > Sorry, I tend to go to sleep when presented with rows and rows of
> > numbers. Sure, it's good to show the data but I much prefer it if the
> > sender can tell us what the data means: the executive summary.
> 
> sorry, I'm more the technical person and I hate
> 'executive summaries' and similar stuff, but the
> message is simple and clear: accounting works even
> for shared/unified guests, all three guests show
> reasonably similar values ...

I don't see "accounting" as being useful for resource management.  I mean,
so we have a bunch of numbers - so what?

The problem is: what do we do when the jobs in a container exceed their
allotment?

With zone-based physical containers we already have

Re: About GCC4 Optimization

2007-03-24 Thread Linus Torvalds


On Sun, 25 Mar 2007, yuan cooper wrote:
> 
> during my work, I found there is a bug with GCC4 O2 optimization.

Technically, it's a misfeature of gcc4, not a bug.

The C language allows for type-based alias detection, and gcc notices that 
a "float *" cannot ever alias with a "unsigned long *", so it decides to 
not even do the loads and stores..

Now, there's two things wrong with this picture:

 - gcc is being an ass. type-based alias detection should happen only as a 
   last resort, and gcc should know and notice that *despite* the types 
   being different, they definitely alias.

   So what gcc does may be technically legal, but it's still a horribly 
   bad thing to do. Sadly, some gcc people seem to care more about "letter 
   of the law" than "sanity and quality of implementation".

 - as a result, you should always compile any kernel stuff with 
   "-fno-strict-aliasing", which should turn this off. If it *still* 
   happens with that flag, then it is indeed a compiler bug.

> float ftmp;
> unsigned long tmp;
> ftmp = 1.0/1024.0;
> tmp  = *(unsigned long *)(&ftmp);
> tmp  = (tmp >> 11) && 0xFFF;
> 
> if optimization level is O2, gcc will MOV eax to tmp, but current eax has a 
> random value.
> -O is ok and gcc3 with O2 is ok too.

That said, you really _really_ shouldn't be doing FP in the kernel anyway.

Linus

Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Srivatsa Vaddagiri
On Sun, Mar 25, 2007 at 07:58:16AM +0530, Srivatsa Vaddagiri wrote:
> Not just this, continuing further we have more trouble:
> 
> 
> CPU0 (attach_task T1 to CS2)  CPU1 (T1 is exiting)
> 
> 
> synchronize_rcu()
>   atomic_dec(&CS1->count);
>   [CS1->count = 0]
> 
> if atomic_dec_and_test(&oldcs->count))
>   [CS1->count = -1]
> 
> 
> 
> We now have CS1->count negative. Is that good? I am uncomfortable ..
> 
> We need a task_lock() in cpuset_exit to avoid this race.

2nd race is tricky. We probably need to do this to avoid it:

task_lock(tsk);

/* Check if tsk->cpuset is still same. We may have raced with 
 * cpuset_exit changing tsk->cpuset again under our feet.
 */
if (tsk->cpuset == cs && atomic_dec_and_test(&oldcs->count)) {
task_unlock(tsk);
check_for_release(oldcs, ppathbuf);
goto done;
}

task_unlock(tsk);

done:
return 0;



-- 
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Eric W. Biederman
Thomas Meyer <[EMAIL PROTECTED]> writes:

> Eric W. Biederman schrieb:
>>
>> Odd.  I would have thought the oops happened in the first resume, not
>> the second. 
>>
>> Hmm.  It may have something to do with the ``managed'' driver
>> aspect of this as well..
>>   
> No. I don't think so. The problem is caused by this sequence: (the info
> is always before entry of a function and before the exit of a function):

Ok.  Thanks.   It is the ordering of events that keeps it
from showing up.  The problem happens the first time but only
after we have restored msi state so we don't see the ill effects
until the second time.


Ok staring at the code and thinking about the problem.  The only
thing that pci_enable_device does (except messing with irqs) is
flip enable bits.   Further, pci_enable_device only messes with irqs
on 5 architectures.   Only ia64 really cares.  i386 and x86_64
it is simply delaying work until we need it.  frv doesn't
really care it just pokes the irq value back into the hardware
for some reason.  cris just sets a hard coded value.  Does cris
only have one pci irq?

So I think the right solution is to simply make pci_enable_device
just flip enable bits and move the rest of the work someplace else.

However a thorough cleanup is a little extreme for this point in
the release cycle, so I think a quick hack that makes the code
not stomp the irq when msi irq's are enabled should be the first
fix.  Then we can later make the code not change the irqs at all.

Thomas could you verify the patch below makes the problem go away
for you.

Tony, Len the way pci_disable_device is being used in a suspend/resume
path by a few drivers is completely incompatible with the way irqs
are allocated on ia64.  In particular, the following sequence
occurs in several drivers.

probe:
  pci_enable_device(pdev);
  request_irq(pdev->irq);
suspend:
  pci_disable_device(pdev);
resume:
  pci_enable_device(pdev);
remove:
  free_irq(pdev->irq);
  pci_disable_device(pdev);

What I'm proposing we do is move the irq allocation code out of
pci_enable_device and the irq freeing code out of pci_disable_device
in the future.  If we move ia64 to a model where the irq number equals
the gsi like we have for x86_64 and are in the middle of for i386 that
should be pretty straight forward.  It would even be relatively simple
to delay vector allocation in that context until request_irq, if we
needed the delayed allocation benefit.   Do you two have any problems
with moving in that direction?

If fixing the arch code is unacceptable for some reason I'm not aware
of we need to audit the 10-20 drivers that call pci_disable_device
in their suspend/resume processing and ensure that they have freed
all of the irqs before that point.  Given that I have bug reports on
the msi path I know that isn't true.

Tony, Len before we merge any fixes for 2.6.21-rcX I'd like to at
least get an ack on the long term direction.

Thanks,
Eric


diff --git a/arch/cris/arch-v32/drivers/pci/bios.c 
b/arch/cris/arch-v32/drivers/pci/bios.c
index a2b9c60..5b79a7a 100644
--- a/arch/cris/arch-v32/drivers/pci/bios.c
+++ b/arch/cris/arch-v32/drivers/pci/bios.c
@@ -100,7 +100,9 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
if ((err = pcibios_enable_resources(dev, mask)) < 0)
return err;
 
-   return pcibios_enable_irq(dev);
+   if (!dev->msi_enabled)
+   pcibios_enable_irq(dev);
+   return 0;
 }
 
 int pcibios_assign_resources(void)
diff --git a/arch/frv/mb93090-mb00/pci-vdk.c b/arch/frv/mb93090-mb00/pci-vdk.c
index f7279d7..0b581e3 100644
--- a/arch/frv/mb93090-mb00/pci-vdk.c
+++ b/arch/frv/mb93090-mb00/pci-vdk.c
@@ -466,6 +466,7 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 
if ((err = pcibios_enable_resources(dev, mask)) < 0)
return err;
-   pcibios_enable_irq(dev);
+   if (!dev->msi_enabled)
+   pcibios_enable_irq(dev);
return 0;
 }
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 1bb0693..a990a6c 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -426,11 +426,13 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
if ((err = pcibios_enable_resources(dev, mask)) < 0)
return err;
 
-   return pcibios_enable_irq(dev);
+   if (!dev->msi_enabled)
+   return pcibios_enable_irq(dev);
+   return 0;
 }
 
 void pcibios_disable_device (struct pci_dev *dev)
 {
-   if (pcibios_disable_irq)
+   if (!dev->msi_enabled && pcibios_disable_irq)
pcibios_disable_irq(dev);
 }
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 474d179..f8bcccd 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -557,14 +557,18 @@ pcibios_enable_device (struct pci_dev *dev, int mask)
if (ret < 0)
return ret;
 
-   return acpi_pci_irq_enable(dev);
+   if (!dev->msi_enabled)
+   return acpi_pci_irq_enable(dev);

Re: keyboard.c: Stop flooding dmesg with useless warnings

2007-03-24 Thread Parag Warudkar




Signed-off-by: Parag Warudkar <[EMAIL PROTECTED]>

--- linux-2.6-wk/drivers/char/keyboard.c	2007-03-24 23:01:19.0 
-0400

+++ linux-2.6/drivers/char/keyboard.c   2007-03-24 21:43:58.0 -0400
@@ -1161,7 +1161,7 @@

if ((raw_mode = (kbd->kbdmode == VC_RAW)) && !hw_raw)
if (emulate_raw(vc, keycode, !down << 7))
-   if (keycode < BTN_MISC && keycode != KEY_RESERVED)
+   if (keycode < BTN_MISC)
printk(KERN_WARNING "keyboard.c: can't 
emulate rawmode for keycode %d\n", keycode);


#ifdef CONFIG_MAGIC_SYSRQ  /* Handle the SysRq Hack */



Yikes. Wrong one above. Right one below. Against latest git.

I use  Apple keyboard and mouse which seem to generate events with
keycode==0.

keyboard.c floods dmesg endlessly with below messages. This happens at a
very fast rate and never stops, leaving the dmesg unusable.

[46591.960000] keyboard.c: can't emulate rawmode for keycode 0
[46591.996000] keyboard.c: can't emulate rawmode for keycode 0
[46592.032000] keyboard.c: can't emulate rawmode for keycode 0
[46592.068000] keyboard.c: can't emulate rawmode for keycode 0
[46592.104000] keyboard.c: can't emulate rawmode for keycode 0
[46592.140000] keyboard.c: can't emulate rawmode for keycode 0
[46592.176000] keyboard.c: can't emulate rawmode for keycode 0
[46592.212000] keyboard.c: can't emulate rawmode for keycode 0
[46592.248000] keyboard.c: can't emulate rawmode for keycode 0

The patch below avoids printing the warning if keycode == KEY_RESERVED.

If a more correct fix is possible please let me know and I will redo this.
(I suspect avoiding call to emulate_raw() with value ranges it cannot
emulate might be a better fix?)

Signed-off-by: Parag Warudkar <[EMAIL PROTECTED]>

--- linux-2.6/drivers/char/keyboard.c   2007-03-24 21:43:58.0 -0400
+++ linux-2.6-wk/drivers/char/keyboard.c2007-03-24 23:01:19.0 
-0400
@@ -1161,7 +1161,7 @@

if ((raw_mode = (kbd->kbdmode == VC_RAW)) && !hw_raw)
if (emulate_raw(vc, keycode, !down << 7))
-   if (keycode < BTN_MISC)
+   if (keycode < BTN_MISC && keycode != KEY_RESERVED)
printk(KERN_WARNING "keyboard.c: can't emulate 
rawmode for keycode %d\n", keycode);

 #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


keyboard.c: Stop flooding dmesg with useless warnings

2007-03-24 Thread Parag Warudkar


I use  Apple keyboard and mouse which seem to generate events with 
keycode==0.


keyboard.c floods dmesg endlessly with below messages. This happens at a 
very fast rate and never stops, leaving the dmesg unusable.


[46591.960000] keyboard.c: can't emulate rawmode for keycode 0
[46591.996000] keyboard.c: can't emulate rawmode for keycode 0
[46592.032000] keyboard.c: can't emulate rawmode for keycode 0
[46592.068000] keyboard.c: can't emulate rawmode for keycode 0
[46592.104000] keyboard.c: can't emulate rawmode for keycode 0
[46592.140000] keyboard.c: can't emulate rawmode for keycode 0
[46592.176000] keyboard.c: can't emulate rawmode for keycode 0
[46592.212000] keyboard.c: can't emulate rawmode for keycode 0
[46592.248000] keyboard.c: can't emulate rawmode for keycode 0

The patch below avoids printing the warning if keycode == KEY_RESERVED.

If a more correct fix is possible please let me know and I will redo this.
(I suspect avoiding call to emulate_raw() with value ranges it cannot 
emulate might be a better fix?)


Otherwise please consider applying.

Signed-off-by: Parag Warudkar <[EMAIL PROTECTED]>

--- linux-2.6-wk/drivers/char/keyboard.c2007-03-24 23:01:19.0 
-0400
+++ linux-2.6/drivers/char/keyboard.c   2007-03-24 21:43:58.0 -0400
@@ -1161,7 +1161,7 @@

if ((raw_mode = (kbd->kbdmode == VC_RAW)) && !hw_raw)
if (emulate_raw(vc, keycode, !down << 7))
-   if (keycode < BTN_MISC && keycode != KEY_RESERVED)
+   if (keycode < BTN_MISC)
printk(KERN_WARNING "keyboard.c: can't emulate 
rawmode for keycode %d\n", keycode);

 #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux-VServer example results for sharing vs. separate mappings ...

2007-03-24 Thread Herbert Poetzl
On Sat, Mar 24, 2007 at 12:19:06PM -0800, Andrew Morton wrote:
> On Sat, 24 Mar 2007 19:38:06 +0100 Herbert Poetzl <[EMAIL PROTECTED]> wrote:
> 
> > On Fri, Mar 23, 2007 at 09:42:35PM -0800, Andrew Morton wrote:
> > > On Fri, 23 Mar 2007 20:30:00 +0100 Herbert Poetzl <[EMAIL PROTECTED]> 
> > > wrote:
> > > 
> > > > 
> > > > Hi Eric!
> > > > Hi Folks!
> > > > 
> > > > here is a real world example result from one of my tests
> > > > regarding the benefit of sharing over separate memory
> > > > 
> > > > the setup is quite simple, a typical machine used by
> > > > providers all over the world, a dual Pentium D 3.2GHz
> > > > with 4GB of memory and a single 160GB SATA disk running
> > > > a Linux-VServer kernel (2.6.19.7-vs2.2.0-rc18)
> > > > 
> > > > the Guest systems used are Mandriva 2007 guests with
> > > > syslog, crond, sshd, apache, postfix and postgresql
> > > > installed and running (all in all 17 processes per guest)
> > > > 
> > > > the disk space used by one guests is roughly 148MB
> > > > 
> > > > in addition to that, a normal host system is running
> > > > with a few daemons (like sshd, httpd, postfix ...)
> > > > 
> > > > 
> > > > the first test setup is starting 200 of those guests
> > > > one after the other and measuring the memory usage
> > > > before and after the guest did start, as well as 
> > > > recording the time used to start them ...
> > > > 
> > > > this is done right after the machine was rebooted, in
> > > > one test with 200 separate guests (i.e. 200 x 148MB) 
> > > > and in a second run with 200 unified guests (which
> > > > means roughly 138MB of shared files)
> > > 
> > > Please define your terms.  
> > > What is a "separated guest", what is a "unified guest" 
> > > and how do they differ?
> > 
> > separated guests are complete Linux Distributions which
> > do not share (filesystem wise) anything with any other
> > guest ... i.e. all files and executables have to be
> > paged in and get separate mappings (and thus separate
> > memory)
> > 
> > unified guests use a mechanism we (Linux-VServer) call
> > 'unification' which can be considered an advanced form
> > of hard linking (i.e. we add special flags to protect
> > those hard links from modification. such a file is 
> > copied on demand (CoW Link Breaking) on the first attempt
> > to be modified (attributes or content)
> 
> OK.
> 
> > > If a "separated" guest is something in which separate 
> > > guests will use distinct physical pages to cache the 
> > > contents of /etc/passwd (ie: a separate filesystem 
> > > per guest) then I don't think that's interesting 
> > > information, frankly.
> > 
> > well, you didn't bother to answer my questions regarding
> > your suggested approach yet,
> 
> Have been a bit distracted lately, and these discussions 
> seem to go on an on without ever converging.

well, it's never easy if different ideologies
try to find a common denominator, but contrary to you,
I have the feeling that progress is made ...

> > and as I am concerned that
> > some of the suggested approaches sacrifice performance
> > and resource sharing/efficiency for simplicity or (as
> > we recently had) 'ability to explain it to the customer'
> 
> The problem is memory reclaim.  A number of schemes which 
> have been proposed require a per-container page reclaim 
> mechanism - basically a separate scanner.
> 
> This is a huge, huge, huge problem.  The present scanner
> has been under development for over a decade and has had
> tremendous amounts of work and testing put into it.
> And it still has problems.  But those problems will be 
> gradually addressed.
> 
> A per-container reclaim scheme really really really wants 
> to reuse all that stuff rather than creating a separate,
> parallel, new scanner which has the same robustness 
> requirements, only has a decade less test and development
> done on it.  And which permanently doubles our maintenance
> costs.

I completely agree here

> So how do we reuse our existing scanner?  With physical containers. 
> One can envisage several schemes:
> 
> a) slice the machine into 128 fake NUMA nodes, use each node as the
>basic block of memory allocation, manage the binding between these
>memory hunks and process groups with cpusets.

128 sounds a little small to me, considering that we
already see 300+ Guests on older machines 
(or am I missing something here?)

>This is what google are testing, and it works.
> 
> b) Create a new memory abstraction, call it the "software zone",
>which is mostly decoupled from the present "hardware zones". Most of
>the MM is reworked to use "software zones". The "software zones" are
>runtime-resizeable, and obtain their pages via some means from the
>hardware zones. A container uses a software zone.
> 
> c) Something else, similar to the above.  Various schemes can be
>envisaged, it isn't terribly important for this discussion.

for me, the most natural approach is the one with 
the least impact and smalles

Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Srivatsa Vaddagiri
On Sat, Mar 24, 2007 at 06:41:28PM -0700, Paul Jackson wrote:
> > the following code becomes racy with cpuset_exit() ...
> > 
> > atomic_inc(&cs->count);
> > rcu_assign_pointer(tsk->cpuset, cs);
> > task_unlock(tsk);
> 
> eh ... so ... ?
> 
> I don't know of any sequence where that causes any problem.
> 
> Do you see one?

Let's say we had two cpusets CS1 and CS2 (both different from top_cpuset).
CS1 has just one task T1 in it (CS1->count = 1) while CS2 has no tasks
in it (CS2->count = 0).

Now consider:


CPU0 (attach_task T1 to CS2)CPU1 (T1 is exiting)


task_lock(T1);

oldcs = tsk->cpuset;
[oldcs = CS1]

T1->flags & PF_EXITING? (No)

T1->flags = PF_EXITING;

atomic_inc(&CS2->count);

cpuset_exit()
cs = tsk->cpuset; (cs = CS1)

T1->cpuset = CS2;

T1->cpuset = &top_cpuset;

task_unlock(T1);


CS2 has one bogus count now (with no tasks in it), which may prevent it from 
being removed/freed forever.


Not just this, continuing further we have more trouble:


CPU0 (attach_task T1 to CS2)CPU1 (T1 is exiting)


synchronize_rcu()
atomic_dec(&CS1->count);
[CS1->count = 0]

if atomic_dec_and_test(&oldcs->count))
[CS1->count = -1]



We now have CS1->count negative. Is that good? I am uncomfortable ..

We need a task_lock() in cpuset_exit to avoid this race.

-- 
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] sched: accurate user accounting

2007-03-24 Thread Con Kolivas
On Sunday 25 March 2007 11:59, Con Kolivas wrote:
> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>
> ---
> Currently we only do cpu accounting to userspace based on what is actually
> happening precisely on each tick. The accuracy of that accounting gets
> progressively worse the lower HZ is. As we already keep accounting of
> nanosecond resolution we can accurately track user cpu, nice cpu and idle
> cpu if we move the accounting to update_cpu_clock with a nanosecond
> cpu_usage_stat entry. This increases overhead slightly but avoids the
> problem of tick aliasing errors making accounting unreliable.

Vale, this fixes your testcase you sent. Attached below for reference. 

P.S. Sorry about one of the cc email addresses in the first email; I succumbed 
to a silly practical joke unwittingly so you'll have to remove it when 
replying to all.

/* gcc -o hog smallhog.c */
#include <limits.h>
#include <signal.h>
#include <stddef.h>
#include <sys/time.h>

/* Number of calibration samples that main() collects and averages. */
#define HIST 10

/*
 * Flag raised by sighandler() and polled by hog().
 *
 * It must be volatile: the flag is modified asynchronously from a signal
 * handler, and without the qualifier an optimizing compiler is free to read
 * it once before hog()'s busy loop and never notice the handler's store.
 */
static volatile sig_atomic_t stop;

/*
 * Signal handler (main() installs it for SIGALRM): raise the stop flag so
 * that a running hog() loop terminates.  The signal number is not used.
 */
static void sighandler (int signr)
{
 stop = 1;
 (void) signr;
}

/*
 * Busy-loop, burning CPU.
 *
 * Clears the stop flag, then pre-decrements niters on every pass until
 * either the flag has been raised or the counter reaches zero.
 * Returns the value niters holds when the loop ends.
 */
static unsigned long hog (unsigned long niters)
{
 stop = 0;
 for (;;) {
  if (stop)
   break;
  if (--niters == 0)
   break;
 }
 return niters;
}

/*
 * Calibrate how many hog() iterations run before the interval timer's
 * SIGALRM interrupts the loop, then forever burn two thirds of that
 * iteration count and wait for the next SIGALRM.
 */
int main (void)
{
 /* Real-time interval timer; first expiry and reload use the same period. */
 struct itimerval timer = {
  .it_interval = { .tv_sec = 0, .tv_usec = 1 },
  .it_value = { .tv_sec = 0, .tv_usec = 1 }
 };
 unsigned long samples[HIST];
 unsigned long budget;
 sigset_t alarm_set;
 double mean = 0.0;
 int k;

 signal (SIGALRM, &sighandler);
 setitimer (ITIMER_REAL, &timer, NULL);

 /* Each sample is the number of iterations hog() completed before stopping. */
 k = 0;
 while (k < HIST) {
  samples[k] = ULONG_MAX - hog (ULONG_MAX);
  ++k;
 }

 /* Average the samples and keep two thirds of the mean as the work budget. */
 k = 0;
 while (k < HIST) {
  mean += samples[k];
  ++k;
 }
 mean /= HIST;
 budget = mean - (mean / 3.0);

 sigemptyset (&alarm_set);
 sigaddset (&alarm_set, SIGALRM);

 /* Spin for the budget, then wait for SIGALRM; k receives the signal number. */
 while (1) {
  hog (budget);
  sigwait (&alarm_set, &k);
 }
 return 0;
}

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] cosmetic - minor typos in drivers/isdn/capi/Kconfig

2007-03-24 Thread Patrick Ringl

Hello,

[ ... fixed one minor typo and adapted the word order ... ]


--- linux-2.6.20-o/drivers/isdn/capi/Kconfig2007-03-18 
00:04:54.0 +0100
+++ linux-2.6.20/drivers/isdn/capi/Kconfig2007-03-25 
04:10:24.0 +0200

@@ -17,7 +17,7 @@
help
  If you say Y here, the kernelcapi driver can make verbose traces
  of CAPI messages. This feature can be enabled/disabled via IOCTL for
-  every controler (default disabled).
+  every controller (disabled by default).
  This will increase the size of the kernelcapi module by 20 KB.
  If unsure, say Y.



---


regards,
Patrick
--- linux-2.6.20-o/drivers/isdn/capi/Kconfig2007-03-18 00:04:54.0 
+0100
+++ linux-2.6.20/drivers/isdn/capi/Kconfig  2007-03-25 04:10:24.0 
+0200
@@ -17,7 +17,7 @@
help
  If you say Y here, the kernelcapi driver can make verbose traces
  of CAPI messages. This feature can be enabled/disabled via IOCTL for
- every controler (default disabled).
+ every controller (disabled by default).
  This will increase the size of the kernelcapi module by 20 KB.
  If unsure, say Y.
 


Re: [PATCH 2.6.20.3] Flush writes to MSI-X table

2007-03-24 Thread Roland Dreier
 > I don't understand your need to try to rush an api change like this in
 > so quickly in an area that has a lot of churn and disagreement lately.
 > _Especially_ so late in the release cycle, and with no hardware publicly
 > available.

I'm not sure I understood this thread properly, but if I did understand
correctly then this bug affects IRQ balancing on any device with MSI-X
enabled.  In which case, there's plenty of publicly available hardware
with MSI-X support (including drivers in the mainline tree for a long
time).  A quick grep for pci_enable_msix finds plenty of drivers using
MSI-X now: cciss, ib_mthca, cxgb3, forcedeth, s2io, qla2xxx.

 - R.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] [RFC] sched: accurate user accounting

2007-03-24 Thread Con Kolivas
For an rsdl 0.33 patched kernel. Comments? Overhead worth it?

---
Currently we only do cpu accounting to userspace based on what is actually
happening precisely on each tick. The accuracy of that accounting gets
progressively worse the lower HZ is. As we already keep accounting of
nanosecond resolution we can accurately track user cpu, nice cpu and idle cpu
if we move the accounting to update_cpu_clock with a nanosecond cpu_usage_stat
entry. This increases overhead slightly but avoids the problem of tick
aliasing errors making accounting unreliable.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 include/linux/kernel_stat.h |3 ++
 include/linux/sched.h   |2 -
 kernel/sched.c  |   51 +---
 kernel/timer.c  |5 +---
 4 files changed, 54 insertions(+), 7 deletions(-)

Index: linux-2.6.20.4-ck1/include/linux/kernel_stat.h
===
--- linux-2.6.20.4-ck1.orig/include/linux/kernel_stat.h 2007-03-25 
09:47:52.0 +1000
+++ linux-2.6.20.4-ck1/include/linux/kernel_stat.h  2007-03-25 
11:31:29.0 +1000
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
cputime64_t user;
+   cputime64_t user_ns;
cputime64_t nice;
+   cputime64_t nice_ns;
cputime64_t system;
cputime64_t softirq;
cputime64_t irq;
cputime64_t idle;
+   cputime64_t idle_ns;
cputime64_t iowait;
cputime64_t steal;
 };
Index: linux-2.6.20.4-ck1/kernel/sched.c
===
--- linux-2.6.20.4-ck1.orig/kernel/sched.c  2007-03-25 09:47:56.0 
+1000
+++ linux-2.6.20.4-ck1/kernel/sched.c   2007-03-25 11:42:28.0 +1000
@@ -77,6 +77,11 @@
 #define MAX_USER_PRIO  (USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)  ((p)+MAX_RT_PRIO)
 
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 #define TASK_PREEMPTS_CURR(p, curr)((p)->prio < (curr)->prio)
 
 /*
@@ -2993,8 +2998,50 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-   p->sched_time += now - p->last_ran;
+   struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+   cputime64_t time_diff = now - p->last_ran;
+
+   p->sched_time += time_diff;
p->last_ran = rq->most_recent_timestamp = now;
+   if (p != rq->idle) {
+   cputime_t utime_diff = time_diff;
+
+   if (TASK_NICE(p) > 0) {
+   cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+time_diff);
+   if (NS_TO_JIFFIES(cpustat->nice_ns) > 1) {
+   cpustat->nice_ns =
+   cputime64_sub(cpustat->nice_ns,
+   JIFFIES_TO_NS(1));
+   cpustat->nice =
+   cputime64_add(cpustat->nice, 1);
+   }
+   } else {
+   cpustat->user_ns = cputime64_add(cpustat->user_ns,
+   time_diff);
+   if (NS_TO_JIFFIES(cpustat->user_ns) > 1) {
+   cpustat->user_ns =
+   cputime64_sub(cpustat->user_ns,
+   JIFFIES_TO_NS(1));
+   cpustat ->user =
+   cputime64_add(cpustat->user, 1);
+   }
+   }
+   p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+   if (NS_TO_JIFFIES(p->utime_ns) > 1) {
+   p->utime_ns = cputime_sub(p->utime_ns,
+ JIFFIES_TO_NS(1));
+   p->utime = cputime_add(p->utime,
+  jiffies_to_cputime(1));
+   }
+   } else {
+   cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+   if (NS_TO_JIFFIES(cpustat->idle_ns) > 1) {
+   cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+JIFFIES_TO_NS(1));
+   cpustat->idle = cputime64_add(cpustat->idle, 1);
+   }
+   }
 }
 
 /*
@@ -3059,8 +3106,6 @@ void account_system_time(struct task_str
cpustat->system = cputime64_add(cpustat->system, tmp);
else if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-   else
-   cpustat->idle = cputime64_add(cpustat->idle, tmp);
/* 

[PATCH] Add additional error check to mm/mincore.c

2007-03-24 Thread Bruce Dubbs
In some circumstances, mincore can succeed when it shouldn't.

Example:
  Two files are mmapped to a process and they are adjacent in memory.
If mincore is run with a requested length that is too large, the
function does not differentiate between the different file pointers
within the different vma structures and inappropriately returns success.

The attached patch, against 2.6.20.3, fixes this behavior.

This behavior was found when running the Linux Test Project's mincore01
on an IA32 system.  Test 3 "unexpectedly" succeeds.

  -- Bruce
--- mm/mincore.c.old2007-03-24 19:55:01.0 -0500
+++ mm/mincore.c2007-03-24 20:13:43.0 -0500
@@ -43,7 +43,8 @@
  * all the arguments, we hold the mmap semaphore: we should
  * just return the amount of info we're asked for.
  */
-static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long 
pages)
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long 
pages,
+struct file** file_struct)
 {
unsigned long i, nr, pgoff;
struct vm_area_struct *vma = find_vma(current->mm, addr);
@@ -64,7 +65,19 @@
 * this is what we've traditionally done, so we'll just
 * continue doing it.
 */
-   if (!vma->vm_file)
+
+/* 
+ * Initialize file pointer to the value in the first vma structure
+ */
+
+if ( *file_struct == NULL && vma->vm_file )
+*file_struct = vma->vm_file;
+
+/*
+ * Return an error if there is no file mapped or the file is different
+ */
+ 
+   if (!vma->vm_file || vma->vm_file != *file_struct)
return -ENOMEM;
 
/*
@@ -115,6 +128,7 @@
long retval;
unsigned long pages;
unsigned char *tmp;
+static struct file* file = NULL;
 
/* Check the start address: needs to be page-aligned.. */
if (start & ~PAGE_CACHE_MASK)
@@ -142,7 +156,7 @@
 * the temporary buffer size.
 */
down_read(&current->mm->mmap_sem);
-   retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+   retval = do_mincore(start, tmp, min(pages, PAGE_SIZE), &file);
up_read(&current->mm->mmap_sem);
 
if (retval <= 0)


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Paul Jackson
vatsa wrote:
> > if (tsk->flags & PF_EXITING) {
> 
> What if PF_EXITING is set after this check? If that happens then,
> 
> > task_unlock(tsk);
> > mutex_unlock(&callback_mutex);
> > put_task_struct(tsk);
> > return -ESRCH;
> > }
> 
> the following code becomes racy with cpuset_exit() ...
> 
> atomic_inc(&cs->count);
> rcu_assign_pointer(tsk->cpuset, cs);
> task_unlock(tsk);

eh ... so ... ?

I don't know of any sequence where that causes any problem.

Do you see one?

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH UPDATED] fix typo in drivers/block/Kconfig

2007-03-24 Thread Patrick Ringl

Hello,

[ ... Fixed two minor typos. ...]

The following patch is against 2.6.21-rc4:


--- linux-2.6.20-o/drivers/block/Kconfig2007-03-18 00:04:53.0 
+0100
+++ linux-2.6.20/drivers/block/Kconfig  2007-03-25 03:35:19.0 +0200
@@ -383,7 +383,7 @@
default "16"
depends on BLK_DEV_RAM
help
- The default value is 16 RAM disks. Change this if you know what
+ The default value is 16 RAM disks. Change this if you know what you
  are doing. If you boot from a filesystem that needs to be extracted
  in memory, you will need at least one RAM disk (e.g. root on cramfs).

@@ -393,7 +393,7 @@
default "4096"
help
  The default value is 4096 kilobytes. Only change this if you know
- what are you doing. If you are using IBM S/390, then set this to
+ what you are doing. If you are using IBM S/390, then set this to
  8192.

 config BLK_DEV_RAM_BLOCKSIZE




---

regards,
Patrick




--- linux-2.6.20-o/drivers/block/Kconfig2007-03-18 00:04:53.0 
+0100
+++ linux-2.6.20/drivers/block/Kconfig  2007-03-25 03:35:19.0 +0200
@@ -383,7 +383,7 @@
default "16"
depends on BLK_DEV_RAM
help
- The default value is 16 RAM disks. Change this if you know what
+ The default value is 16 RAM disks. Change this if you know what you
  are doing. If you boot from a filesystem that needs to be extracted
  in memory, you will need at least one RAM disk (e.g. root on cramfs).
 
@@ -393,7 +393,7 @@
default "4096"
help
  The default value is 4096 kilobytes. Only change this if you know
- what are you doing. If you are using IBM S/390, then set this to
+ what you are doing. If you are using IBM S/390, then set this to
  8192.
 
 config BLK_DEV_RAM_BLOCKSIZE


[PATCH] fix typo in drivers/block/Kconfig

2007-03-24 Thread Patrick Ringl

Hello,
this is just a QA / cosmetic fix .. nevertheless the documentation about
modules / drivers should be appropriate to the great work of those who
write all the real important stuff. :-)

The following patch is against 2.6.21-rc4:


--- linux-2.6.20-o/drivers/block/Kconfig2007-03-18 00:04:53.0 
+0100
+++ linux-2.6.20/drivers/block/Kconfig  2007-03-25 00:57:29.0 +0100
@@ -383,7 +383,7 @@
default "16"
depends on BLK_DEV_RAM
help
- The default value is 16 RAM disks. Change this if you know what
+ The default value is 16 RAM disks. Change this if you know what you
  are doing. If you boot from a filesystem that needs to be extracted
  in memory, you will need at least one RAM disk (e.g. root on cramfs).



---

regards,
Patrick



--- linux-2.6.20-o/drivers/block/Kconfig2007-03-18 00:04:53.0 
+0100
+++ linux-2.6.20/drivers/block/Kconfig  2007-03-25 00:57:29.0 +0100
@@ -383,7 +383,7 @@
default "16"
depends on BLK_DEV_RAM
help
- The default value is 16 RAM disks. Change this if you know what
+ The default value is 16 RAM disks. Change this if you know what you
  are doing. If you boot from a filesystem that needs to be extracted
  in memory, you will need at least one RAM disk (e.g. root on cramfs).
 


Re: [BUG] Code reordering in swsusp breaks suspend on SMP systems

2007-03-24 Thread Maxim
On Friday 23 March 2007 16:42:44 Rafael J. Wysocki wrote:
> On Friday, 23 March 2007 00:30, Rafael J. Wysocki wrote:
> > On Thursday, 22 March 2007 00:53, Rafael J. Wysocki wrote:
> > > On Thursday, 22 March 2007 00:39, Maxim wrote:
> > > > On Thursday 22 March 2007 01:24:25 Rafael J. Wysocki wrote:
> > > > > On Thursday, 22 March 2007 00:09, Maxim wrote:
> > > > > > On Thursday 22 March 2007 00:39:02 you wrote:
> > > > > > > On Wednesday, 21 March 2007 23:21, Pavel Machek wrote:
> > > > > > > > Hi!
> > > > > > > > 
> > > > > > > > > Starting with 2.6.21-rc1 suspend to ram and disk doesn't work 
> > > > > > > > > anymore on my system.
> > > > > > > > > 
> > > > > > > > > I did a git-bisect and found that those commits break it:
> > > > > > > > > 
> > > > > > > > > e3c7db621bed4afb8e231cb005057f2feb5db557 - [PATCH] [PATCH] 
> > > > > > > > > PM: Change code ordering in main.c
> > > > > > > > > ed746e3b18f4df18afa3763155972c5835f284c5 - [PATCH] [PATCH] 
> > > > > > > > > swsusp: Change code ordering in disk.c
> > > > > > > > > 259130526c267550bc365d3015917d90667732f1 - [PATCH] [PATCH] 
> > > > > > > > > swsusp: Change code ordering in user.c
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > (Yep, it was in my "to analyze" queue).
> > > > > > > > 
> > > > > > > > > I already reported about it, but now i know the reason why 
> > > > > > > > > suspend breaks.
> > > > > > > > > 
> > > > > > > > > The problem is that both cpu_up/cpu_down were allowed to 
> > > > > > > > > sleep until now, 
> > > > > > > > > and it did work because those functions could be called only 
> > > > > > > > > in process context
> > > > > > > > > (the one that writes to /sys/devices/system/cpu/cpu*/online) 
> > > > > > > > > or  idle thread  that does smp_init()).
> > > > > > > > > 
> > > > > > > > > But now they are called _after_ all tasks were suspended, so 
> > > > > > > > > if cpu_down tries for example to take a lock
> > > > > > > > > that is taken by different process, it can't since the 
> > > > > > > > > different proccess is frozen and can't release the lock.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Thanks for detailed explanation.
> > > > > > > > 
> > > > > > > > ...but, on my machine suspend works ok in -rc4. I'm not seeing 
> > > > > > > > this.
> > > > > > > > 
> > > > > > > > ...by design, "frozen" tasks must not hold any locks. If frozen 
> > > > > > > > task
> > > > > > > > holds a lock, that's a bug.
> > > > > > > > 
> > > > > > > > > Or, it is also possible to revert this change.
> > > > > > > > 
> > > > > > > > Are you using xfs?
> > > > > > > 
> > > > > > > Well, this is the only case that can trigger it.  There are no 
> > > > > > > other freezable
> > > > > > > workqueues.
> > > > > > > 
> > > > > > > Greetings,
> > > > > > > Rafael
> > > > > > > 
> > > > > > 
> > > > > > Hello,
> > > > > > 
> > > > > > Yes, you are right and it is XFS
> > > > > > 
> > > > > > System suspends and resumes with xfs and your patch correctly,
> > > > > 
> > > > > Could you please sent this information to the list?  I'd like it to 
> > > > > reach all
> > > > > of the CCed parites. ;-)
> > > > 
> > > > I did now ( sorry I just keep using this Answer command, instead of 
> > > > Answer to everybody)
> > > > I didn't intend to send private email.
> > > > > 
> > > > > > Of course I need to mention that I had to unload microcode 
> > > > > > update driver because it prevented resume,
> > > > > > because it calls firmware loader helper, and again sleeps on 
> > > > > > lock
> > > > > 
> > > > > This is interesting.  Did it happen before or is it a regression?
> > > > 
> > > > It is from the same group of bugs , I mean hang because cpu_up/down is 
> > > > called with frozen tasks
> > > > Of course it didn't happen before those reordering commits were 
> > > > introduced
> > > 
> > > Well, we want cpu_up/down to be called after processes have been frozen, 
> > > for
> > > various reasons (one of them being that applications shouldn't see us 
> > > playing
> > > with the CPUs).
> > > 
> > > Thanks for reporting this, I'll have a look at the microcode update 
> > > driver.
> > 
> > Well, I have invented the appended workaround, but I'm not sure how much
> > sense it makes with respect to the microcode driver.  At least, it doesn't
> > break my AMD64 SMP setup. ;-)
> 
> Modified version of the patch is appended.  Unfortunately I have no hardware
> supporting the microcode updates.
> 
> Greetings,
> Rafael
> 
> 
> ---
>  arch/i386/kernel/microcode.c |   28 +---
>  include/linux/cpu.h  |2 ++
>  kernel/cpu.c |   32 
>  3 files changed, 43 insertions(+), 19 deletions(-)
> 
> Index: linux-2.6.21-rc4/arch/i386/kernel/microcode.c
> ===
> --- linux-2.6.21-rc4.orig/arch/i386/kernel/microcode.c
> +++ linux-2.6.21-rc4/arch/i386/kernel/microcode.c
> @@ -567,6 +567,16 @@ static int cpu

Re: pciehp: Cannot get control of hotplug hardware

2007-03-24 Thread Greg KH
On Sat, Mar 10, 2007 at 10:55:16AM -0500, Ryan Hope wrote:
> Ever since I started playing with suspend I started turning on PCI Hot
> Plug support since then I have been seeing messages like whats
> below from dmesg I'm not exactly sure how this actually impacts me
> if it does at all. I just thought it didn't look exactly right so I
> wanted to inquire about it. Does anyone know what is going on here?
> 
> -Ryan
> 
> pci_hotplug: PCI Hot Plug PCI Core version: 0.5
> acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
> decode_hpp: Could not get hotplug parameters. Use defaults
> pciehp: HPC vendor_id 8086 device_id 27d0 ss_vid 0 ss_did 0
> Evaluate _OSC Set fails. Status = 0x0005
> Evaluate _OSC Set fails. Status = 0x0005
> pciehp: Cannot get control of hotplug hardware for pci :00:1c.0

Your pci express hotplug controller does not support the proper hotplug
capabilities.  It's probably a bios issue as Windows doesn't support
this just yet (I think Vista now does), so those functions were usually
never tested.

You should be able to just ignore them, unless you want to use your pci
express hotplug functionality.  What kind of hardware is this, a laptop
with expresscard?  Or something else?

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Srivatsa Vaddagiri
On Sat, Mar 24, 2007 at 12:25:59PM -0700, Paul Jackson wrote:
> > P.S : cpuset.c checks for PF_EXITING twice in attach_task(), while this
> > patch seems to be checking only once. Is that fine?
> 
> I think the cpuset code is ok, because, as you note, it locks the task,
> picks off the cpuset pointer, and then checks a second time that the
> task still does not have PF_EXITING set:

Well afaics, PF_EXITING is set for the exiting task w/o taking any lock, which
makes this racy always.

> In the kernel/cpuset.c code for attach_task():
> 
> task_lock(tsk);
> oldcs = tsk->cpuset;
> /*
>  * After getting 'oldcs' cpuset ptr, be sure still not exiting.
>  * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
>  * then fail this attach_task(), to avoid breaking top_cpuset.count.
>  */
> if (tsk->flags & PF_EXITING) {

What if PF_EXITING is set after this check? If that happens then,

> task_unlock(tsk);
> mutex_unlock(&callback_mutex);
> put_task_struct(tsk);
> return -ESRCH;
> }

the following code becomes racy with cpuset_exit() ...

atomic_inc(&cs->count);
rcu_assign_pointer(tsk->cpuset, cs);
task_unlock(tsk);


-- 
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] max_loop limit

2007-03-24 Thread Ken Chen

On 3/23/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote:

@@ -1383,7 +1380,7 @@ int loop_unregister_transfer(int number)

xfer_funcs[n] = NULL;

-   for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
+   list_for_each_entry(lo, &loop_devices, lo_list) {
mutex_lock(&lo->lo_ctl_mutex);


Don't you need to use loop_devices_lock to protect the linked list here?



+static struct loop_device *loop_find_dev(unsigned int number)
+{
+   struct loop_device *lo;
+   list_for_each_entry(lo, &loop_devices, lo_list)
+   if (lo->lo_number == number)
+   return lo;
+   return NULL;


Here too with spin lock??
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] max_loop limit

2007-03-24 Thread Ken Chen

On 3/23/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote:

Sadly, it locks up the foreground process (losetup that would be), and I
have not yet figured out why. And the mpt regression elsewhere is
hindering me in finding out faster.


You need to tell the block layer that each loop device is a whole
block device, not a partition within another device. Otherwise, I
think it will cause a recursive mutex lock in block_dev.c:do_open().

This patch should fix the problem.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

--- ./drivers/block/loop.c.orig 2007-03-24 17:05:51.0 -0700
+++ ./drivers/block/loop.c  2007-03-24 17:06:06.0 -0700
@@ -1464,6 +1464,7 @@

if ((lo = loop_find_dev(number)) == NULL) {
lo = loop_init_one(number);
+   *part = 0;
if (IS_ERR(lo))
return (void *)lo;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/3] Trec driver.

2007-03-24 Thread Wink Saville
This is the Trec driver, Makefile, header files.
Enable trec in Kernel hacking configuration menu.

Signed-off-by: Wink Saville <[EMAIL PROTECTED]>
---
 drivers/Makefile   |1 +
 drivers/trec/Makefile  |5 +
 drivers/trec/trec.c|  404 
 include/asm-generic/trec.h |   17 ++
 include/asm-i386/trec.h|   33 
 include/asm-x86_64/trec.h  |   13 ++
 include/linux/trec.h   |   75 
 lib/Kconfig.debug  |7 +
 8 files changed, 555 insertions(+), 0 deletions(-)
 create mode 100644 drivers/trec/Makefile
 create mode 100644 drivers/trec/trec.c
 create mode 100644 include/asm-generic/trec.h
 create mode 100644 include/asm-i386/trec.h
 create mode 100644 include/asm-x86_64/trec.h
 create mode 100644 include/linux/trec.h

diff --git a/drivers/Makefile b/drivers/Makefile
index 3a718f5..01724c0 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -81,3 +81,4 @@ obj-$(CONFIG_GENERIC_TIME)+= clocksource/
 obj-$(CONFIG_DMA_ENGINE)   += dma/
 obj-$(CONFIG_HID)  += hid/
 obj-$(CONFIG_PPC_PS3)  += ps3/
+obj-$(CONFIG_TREC) += trec/
diff --git a/drivers/trec/Makefile b/drivers/trec/Makefile
new file mode 100644
index 000..d930b4d
--- /dev/null
+++ b/drivers/trec/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for Trace records.
+#
+
+obj-$(CONFIG_TREC) += trec.o
diff --git a/drivers/trec/trec.c b/drivers/trec/trec.c
new file mode 100644
index 000..0b04b71
--- /dev/null
+++ b/drivers/trec/trec.c
@@ -0,0 +1,404 @@
+/*
+ * Copyright (C) 2007 Saville Software, Inc.
+ *
+ * This code may be used for any purpose whatsoever,
+ * but no warranty of any kind is provided.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define TREC_DEBUG
+#ifdef TREC_DEBUG
+#define DPK(fmt, args...) printk(KERN_ERR "trec " fmt, ## args)
+#else
+#define DPK(fmt, args...)
+#endif
+
+struct trec_dev_struct
+{
+   struct  cdevcdev;   /* Character device 
structure */
+};
+
+MODULE_AUTHOR("Wink Saville");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Module parameters
+ */
+int major = 240;   /* 240 a "local/experimental" device number for the 
moment */
+int minor = 1;
+
+module_param(major, int, S_IRUGO);
+module_param(minor, int, S_IRUGO);
+
+/*
+ * Forward declarations
+ */
+static int trec_open(struct inode *inode, struct file *file);
+static int trec_release(struct inode *inode, struct file *file);
+
+/*
+ * File operations
+ */
+struct file_operations trec_f_ops = {
+   .owner  =   THIS_MODULE,
+   .open   =   trec_open,
+   .release=   trec_release,
+};
+
+struct trec_struct {
+   uint64_ttsc;
+   unsigned long   pc;
+   unsigned long   tsk;
+   unsigned intpid;
+   unsigned long   v1;
+   unsigned long   v2;
+};
+
+/*
+ * Change trec_buffer_struct.data to be a pointer to a PAGE in the future
+ */
+#define TREC_DATA_SIZE 0x200
+struct trec_buffer_struct {
+   struct trec_buffer_struct * next;
+   struct trec_struct *cur;
+   struct trec_struct *end;
+   struct trec_struct  data[TREC_DATA_SIZE];
+};
+
+/*
+ * Number of buffers must be a multiple of two so we can
+ * snapshot the buffers and the minimum should be 4.
+ */
+#defineTREC_COUNT 2
+struct trec_buffer_struct  trec_buffers[2][TREC_COUNT];
+inttrec_idx = 0;
+spinlock_t trec_lock = SPIN_LOCK_UNLOCKED;
+
+struct trec_buffer_struct *trec_buffer_cur = NULL;
+struct trec_buffer_struct *trec_buffer_snapshot = NULL;
+
+struct trec_dev_struct trec_dev;
+
+/**
+ * Print an address symbol if available to the buffer
+ * this is from traps.c
+ */
+static int snprint_address(char *b, int bsize, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+   unsigned long offset = 0, symsize;
+   const char *symname;
+   char *modname;
+   char *delim = ":";
+   int n;
+   char namebuf[128];
+
+   symname = kallsyms_lookup(address, &symsize, &offset, &modname, 
namebuf);
+   if (!symname) {
+   n = 0;
+   } else {
+   if (!modname)
+   modname = delim = "";   
+   n = snprintf(b, bsize, "0x%016lx %s%s%s%s+0x%lx/0x%lx",
+   address, delim, modname, delim, symname, offset, 
symsize);
+   }
+   return n;
+#else
+   return snprintf(b, bsize, "0x%016lx", address);
+#endif
+}
+
+/*
+ * Initialize the trec buffers
+ */
+void trec_init(void)
+{
+   int i;
+   int j;
+
+   DPK("trec: trec_init E\n");
+
+   for (i = 0; i < 2; i++) {
+   for (j = 0; j < TREC_COUNT; j++) {
+   struct trec_buffer_struct *trec = &trec_buffers[i][j];
+
+   trec->next = &trec_buffers[i][(j+1) % TREC_COUNT];
+

[PATCH 3/3] Initialize and use trec_snapshot and trec_print_snapshot.

2007-03-24 Thread Wink Saville
Trec's are initialized early in main.c and then dump
trec's in die(), panic() and do_page_fault().

Signed-off-by: Wink Saville <[EMAIL PROTECTED]>
---
 arch/x86_64/kernel/traps.c |5 +
 arch/x86_64/mm/fault.c |6 ++
 init/main.c|4 
 kernel/panic.c |5 +
 4 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 09d2e8a..b4f1a36 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -547,9 +548,13 @@ void die(const char * str, struct pt_regs * regs, long err)
 {
unsigned long flags = oops_begin();
 
+   trec_snapshot();
+
if (!user_mode(regs))
report_bug(regs->rip);
 
+   trec_print_snapshot();
+
__die(str, regs, err);
oops_end(flags);
do_exit(SIGSEGV); 
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 6ada723..e92f6bc 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -535,6 +536,8 @@ no_context:
 
flags = oops_begin();
 
+   trec_snapshot();
+
if (address < PAGE_SIZE)
printk(KERN_ALERT "Unable to handle kernel NULL pointer 
dereference");
else
@@ -548,6 +551,9 @@ no_context:
__die("Oops", regs, error_code);
/* Executive summary in case the body of the oops scrolled away */
printk(KERN_EMERG "CR2: %016lx\n", address);
+   
+   trec_print_snapshot();
+
oops_end(flags);
do_exit(SIGKILL);
 
diff --git a/init/main.c b/init/main.c
index a92989e..46bc440 100644
--- a/init/main.c
+++ b/init/main.c
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -517,6 +518,9 @@ asmlinkage void __init start_kernel(void)
early_boot_irqs_off();
early_init_irq_lock_class();
 
+   trec_init();
+   TREC0();
+
 /*
  * Interrupts are still disabled. Do necessary setups, then
  * enable them
diff --git a/kernel/panic.c b/kernel/panic.c
index 623d182..52812f2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 int panic_on_oops;
 int tainted;
@@ -66,6 +67,8 @@ NORET_TYPE void panic(const char * fmt, ...)
 unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
+   trec_snapshot();
+
/*
 * It's possible to come here directly from a panic-assertion and not
 * have preempt disabled. Some functions called from here want
@@ -96,6 +99,8 @@ NORET_TYPE void panic(const char * fmt, ...)
smp_send_stop();
 #endif
 
+   trec_print_snapshot();
+
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
if (!panic_blink)
-- 
1.5.0.rc2

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/3] Documention for trace records (trec).

2007-03-24 Thread Wink Saville
Trec is a light weight tracing mechanism that places
trace information into a buffer. The contents of the
buffer are dumped when errors occur or when enabled
via SYSRQ commands.

Signed-off-by: Wink Saville <[EMAIL PROTECTED]>
---
 Documentation/trec.txt |   87 
 1 files changed, 87 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/trec.txt

diff --git a/Documentation/trec.txt b/Documentation/trec.txt
new file mode 100644
index 000..2275edd
--- /dev/null
+++ b/Documentation/trec.txt
@@ -0,0 +1,87 @@
+Title  : Trace Records
+Authors: Wink Saville <[EMAIL PROTECTED]>
+
+CONTENTS
+
+1. Concepts
+2. Architectures Supported
+3. Configuring
+4. API Reference
+5. Overhead
+6. TODO
+
+
+1. Concepts
+
+Trace records are a light weight tracing technique that time stamps
+small amounts of information and stores them in a buffer. TREC's are
+light enough that they may be sprinkled most anywhere in the kernel
+and have very little performance impact.
+
+For instance they can be placed in the scheduler and ISR's to watch
+the interaction between ISR's and the scheduler. They can be placed
+in memory handling routines to determine how and when memory is
+allocated and freed.
+
+In the current default configuration the trec's are dumped by calling
+trec_print_snapshot when die() or panic() are called as well as when
+the kernel itself page faults in do_page_fault.
+
+If CONFIG_MAGIC_SYSRQ is enabled, the 'y' command will execute trec_snapshot
+and the 'z' command will print the current snapshot.
+
+A general macro TREC allows trec_write to be invoked as a macro and
+TRECC allows it to be invoked conditionally. See include/linux/trec.h
+for the current set of macros.
+
+2. Architectures Supported
+
+Should support all architectures, but has been tested only on:
+
+- X86_64
+
+
+3. Configuring
+
+Since trec's are implemented as a device driver, they are configured by
+enabling support in the "Device Drivers" section of the kernel configuration.
+As they could be used early, building trec as a module is not supported.
+
+
+4. API Reference
+
+Trec supports the following API:
+
+void trec_init(void):
+
+  Initialize the module, this may be called before the driver is loaded
+  if it is desired to use trec's early.
+
+void trec_write(unsigned long pc, int pid, unsigned long v1, unsigned long v2);
+
+  This is the routine used to write into the buffer. pc is the program counter
+  pid is the process id and v1 and v2 are two parameters.
+
+void trec_snapshot(void);
+
+  Calling this function takes a snapshot of the current trec buffer so that it
+  will not be modified. This is called prior to printing the snapshot via
+  trec_print_snapshot.
+
+void trec_print_snapshot(void);
+
+  Print the snapshot.
+
+5. Overhead
+
+Measured on a 2.4GHZ Core 2 Duo the readings between two TREC's is
+270 tics of the rdtsc or about 0.1us. No attempt has been made to
+optimize and less information can be collected if the overhead
+is still too high.
+
+
+6. TODO
+
+a. Add code to dump trec to user space
+b. Enhance to allow runtime registration and runtime enable disable.
+
-- 
1.5.0.rc2

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


kmem_cache_create loop for find the proper gfporder

2007-03-24 Thread Bin Chen

I have some doubts about the loop that finds the gfporder of a cache. In
the code below, its main purpose is to find a gfporder value that makes
the internal fragmentation less than 1/8 of the total slab size. It does
this by increasing gfporder from low to high (possibly 0 to
MAX_GFP_ORDER). But why would increasing the gfporder (or slab size)
decrease the internal fragmentation?

A simple example, suppose the slab management stuff is kept off-slab,
if the gfporder is zero, and the object size in slab is 1000, the
wasted space is 4096 mod 1000 = 96, but with 4096 * 2(increase
gfporder by 1), the space is 8192 mod 1000 = 192, 192 > 96.

Is it right?

By the way, is the first time gfporder is 0? Who initialized it in
cache_cache?

   /* Cal size (in pages) of slabs, and the num of objs per slab.
* This could be made much more intelligent.  For now, try to avoid
* using high page-orders for slabs.  When the gfp() funcs are more
* friendly towards high-order requests, this should be changed.
*/
   do {
   unsigned int break_flag = 0;
cal_wastage:
   kmem_cache_estimate(cachep->gfporder, size, flags,
   &left_over, &cachep->num);
   if (break_flag)
   break;
   if (cachep->gfporder >= MAX_GFP_ORDER)
   break;
   if (!cachep->num)
   goto next;
   if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {
   /* Oops, this num of objs will cause problems. */
   cachep->gfporder--;
   break_flag++;
   goto cal_wastage;
   }

   /*
* Large num of objs is good, but v. large slabs are currently
* bad for the gfp()s.
*/
   if (cachep->gfporder >= slab_break_gfp_order)
   break;

   if ((left_over*8) <= (PAGE_SIZE << cachep->gfporder))
   break;
next:
   cachep->gfporder++;
   } while (1);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2.6.20.3] Flush writes to MSI-X table

2007-03-24 Thread Greg KH
On Sat, Mar 24, 2007 at 04:33:41PM -0700, Kok, Auke wrote:
> Greg KH wrote:
> >On Fri, Mar 23, 2007 at 05:28:02PM -0700, Greg KH wrote:
> >>On Fri, Mar 23, 2007 at 05:24:23PM -0700, Williams, Mitch A wrote:
> >>>Greg KH wrote:
> Well, I'm sure you can agree that it is _very_ late in the 2.6.21
> release cycle to expect to get this in for that kernel.  How about
> waiting for 2.6.22 and if it's a big deal, getting it into the
> 2.6.21-stable tree if needed.
> 
> So far I have not seen any bug reports that this patch would fix, have
> you?
> >>>Well, I've seen several bug reports on this issue -- but they're all
> >>>internal to Intel.
> >>>
> >>>However, we do have here a real bug, which shows up on real hardware,
> >>>which will be released soon.  Obviously, I can't discuss release
> >>>schedules, but "soon" is a good word to use.  You might find out more if
> >>>you read The Register (wink, wink).
> >>Ok, but again, as this is something that no one outside of a company can
> >>see, it doesn't really make sense to rush it into the kernel.
> >>
> >>>Given the time frame for release of 2.6.21, I'd be fine with skipping
> >>>2.6.20.x, and putting this in 2.6.21.  But we really don't want to wait
> >>>for 2.6.22.
> >>I think it needs to wait, especially given that there is no public
> >>hardware yet.
> >>
> >>I'll add this to my queue.
> >
> >No, nevermind, I'll wait till it hits linux-pci and gets review from the
> >people there, as there are a _ton_ of other pending MSI patches that you
> >will need to be aware of, as they might conflict with this patch.
> >Please see the linux-pci archives for details of them.
> 
> Actually Mitch and me have been monitoring those and applying them as they 
> came in for the last two months as some of those partially impacted 
> (improved) the issue. The read flush to update the msi-x tables is the only 
> thing missing right now.

Are you including the 21 set MSI patch that went to linux-pci two days
ago?

I don't understand your need to try to rush an api change like this in
so quickly in an area that has a lot of churn and disagreement lately.
_Especially_ so late in the release cycle, and with no hardware publicly
available.  What is the pressing need here?

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2.6.20.3] Flush writes to MSI-X table

2007-03-24 Thread Kok, Auke

Greg KH wrote:

On Fri, Mar 23, 2007 at 05:28:02PM -0700, Greg KH wrote:

On Fri, Mar 23, 2007 at 05:24:23PM -0700, Williams, Mitch A wrote:

Greg KH wrote:

Well, I'm sure you can agree that it is _very_ late in the 2.6.21
release cycle to expect to get this in for that kernel.  How about
waiting for 2.6.22 and if it's a big deal, getting it into the
2.6.21-stable tree if needed.

So far I have not seen any bug reports that this patch would fix, have
you?

Well, I've seen several bug reports on this issue -- but they're all
internal to Intel.

However, we do have here a real bug, which shows up on real hardware,
which will be released soon.  Obviously, I can't discuss release
schedules, but "soon" is a good word to use.  You might find out more if
you read The Register (wink, wink).

Ok, but again, as this is something that no one outside of a company can
see, it doesn't really make sense to rush it into the kernel.


Given the time frame for release of 2.6.21, I'd be fine with skipping
2.6.20.x, and putting this in 2.6.21.  But we really don't want to wait
for 2.6.22.

I think it needs to wait, especially given that there is no public
hardware yet.

I'll add this to my queue.


No, nevermind, I'll wait till it hits linux-pci and gets review from the
people there, as there are a _ton_ of other pending MSI patches that you
will need to be aware of, as they might conflict with this patch.
Please see the linux-pci archives for details of them.


Actually Mitch and me have been monitoring those and applying them as they came 
in for the last two months as some of those partially impacted (improved) the 
issue. The read flush to update the msi-x tables is the only thing missing right 
now.


Auke'
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] fix typo in net/ieee80211/Kconfig

2007-03-24 Thread Patrick Ringl

Hello,
this is just a QA / cosmetic fix .. nevertheless the documentation about 
modules / drivers should be appropriate to the great work of those who 
write all the real important stuff. :-)


The following patch is against 2.6.21-rc4:


--- /root/dev/linux-2.6.20-o/net/ieee80211/Kconfig2007-02-04 
19:44:54.0 +0100
+++ /root/dev/linux-2.6.20/net/ieee80211/Kconfig2007-03-24 
23:43:22.0 +0100

@@ -38,7 +38,7 @@
Include software based cipher suites in support of IEEE
802.11's WEP.  This is needed for WEP as well as 802.1x.

-This can be compiled as a modules and it will be called
+This can be compiled as a module and it will be called
"ieee80211_crypt_wep".

config IEEE80211_CRYPT_CCMP
@@ -51,7 +51,7 @@
(aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with CCMP enabled
networks.

-This can be compiled as a modules and it will be called
+This can be compiled as a module and it will be called
"ieee80211_crypt_ccmp".

config IEEE80211_CRYPT_TKIP
@@ -66,7 +66,7 @@
(aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with TKIP enabled
networks.

-This can be compiled as a modules and it will be called
+This can be compiled as a module and it will be called
"ieee80211_crypt_tkip".

source "net/ieee80211/softmac/Kconfig"

---

regards,
Patrick


--- /root/dev/linux-2.6.20-o/net/ieee80211/Kconfig  2007-02-04 
19:44:54.0 +0100
+++ /root/dev/linux-2.6.20/net/ieee80211/Kconfig2007-03-24 
23:43:22.0 +0100
@@ -38,7 +38,7 @@
Include software based cipher suites in support of IEEE
802.11's WEP.  This is needed for WEP as well as 802.1x.
 
-   This can be compiled as a modules and it will be called
+   This can be compiled as a module and it will be called
"ieee80211_crypt_wep".
 
 config IEEE80211_CRYPT_CCMP
@@ -51,7 +51,7 @@
(aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with CCMP enabled
networks.
 
-   This can be compiled as a modules and it will be called
+   This can be compiled as a module and it will be called
"ieee80211_crypt_ccmp".
 
 config IEEE80211_CRYPT_TKIP
@@ -66,7 +66,7 @@
(aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with TKIP enabled
networks.
 
-   This can be compiled as a modules and it will be called
+   This can be compiled as a module and it will be called
"ieee80211_crypt_tkip".
 
 source "net/ieee80211/softmac/Kconfig"


Linux 2.6.16.45-rc1

2007-03-24 Thread Adrian Bunk
Location:
ftp://ftp.kernel.org/pub/linux/kernel/people/bunk/linux-2.6.16.y/testing/

git tree:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.16.y.git


Changes since 2.6.16.44:

Adrian Bunk (1):
  Linux 2.6.16.45-rc1

Alexey Dobriyan (1):
  [NET]: Copy mac_len in skb_clone() as well

Bart De Schuymer (1):
  NETFILTER: arp_tables: fix userspace compilation

David S. Miller (1):
  [SPARC64]: Add missing HPAGE_MASK masks on address parameters.

Ed Swierk (1):
  load_module: no BUG if module_subsys uninitialized

Joy Latten (1):
  [XFRM]: Fix missing protocol comparison of larval SAs.

Keith Mannthey (1):
  i386 bootioremap / kexec fix

Masayuki Nakagawa (1):
  [IPV6]: ipv6_fl_socklist is inadvertently shared.

Michał Mirosław (5):
  Fix reference counting (memory leak) problem in __nfulnl_send() and 
callers related to packet queueing.
  [NETFILTER]: nfnetlink_log: fix NULL pointer dereference
  [NETFILTER]: nfnetlink_log: fix possible NULL pointer dereference
  [NETFILTER]: nfnetlink_log: fix reference leak
  [NETFILTER]: nfnetlink_log: fix use after free

Pablo Neira Ayuso (1):
  NETFILTER: ctnetlink: check for status attribute existence on conntrack 
creation

Patrick McHardy (8):
  NETFILTER: Kconfig: fix xt_physdev dependencies
  NETFILTER: Fix iptables ABI breakage on (at least) CRIS
  NETFILTER: nf_conntrack_ipv6: fix crash when handling fragments
  NETFILTER: tcp conntrack: fix IP_CT_TCP_FLAG_CLOSE_INIT value
  NETFILTER: xt_connbytes: fix division by zero
  [NETFILTER]: nf_conntrack: fix incorrect classification of IPv6 fragments 
as ESTABLISHED
  [NETFILTER]: nfnetlink_log: fix crash on bridged packet
  [NETFILTER]: tcp conntrack: accept SYN|URG as valid

Robert Olsson (1):
  [IPV4]: Do not disable preemption in trie_leaf_remove().


 Makefile   |2 -
 arch/i386/mm/boot_ioremap.c|7 ++-
 arch/sparc64/mm/hugetlbpage.c  |   29 ++-
 include/linux/netfilter/nf_conntrack_tcp.h |2 -
 include/linux/netfilter_arp/arp_tables.h   |1 
 include/linux/netfilter_ipv4/ip_tables.h   |2 -
 kernel/module.c|6 +++
 net/core/skbuff.c  |1 
 net/ipv4/fib_trie.c|2 -
 net/ipv4/netfilter/ip_conntrack_netlink.c  |8 ++--
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c|4 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |1 
 net/ipv6/netfilter/nf_conntrack_reasm.c|2 +
 net/ipv6/tcp_ipv6.c|1 
 net/netfilter/Kconfig  |2 -
 net/netfilter/nf_conntrack_netlink.c   |8 ++--
 net/netfilter/nf_conntrack_proto_tcp.c |4 +-
 net/netfilter/nfnetlink_log.c  |   31 -
 net/netfilter/xt_connbytes.c   |   29 ++-
 net/xfrm/xfrm_state.c  |3 +
 20 files changed, 93 insertions(+), 52 deletions(-)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.21-rc4-mm1

2007-03-24 Thread Matt Mackall
On Wed, Mar 21, 2007 at 11:39:17PM -0800, Andrew Morton wrote:
> On Wed, 21 Mar 2007 15:22:25 -0500 Matt Mackall <[EMAIL PROTECTED]> wrote:
> 
> > With the latest -mm, I'm now getting this:
> > 
> > Mar 21 15:06:52 cinder kernel: ipw2200: Detected Intel PRO/Wireless
> > 2200BG Network Connection
> > Mar 21 15:06:52 cinder kernel: firmware_loading_store: unexpected
> > value (0)
> > Mar 21 15:06:52 cinder kernel: ipw2200: ipw2200-bss.fw
> > request_firmware failed:
> > Reason -2
> > Mar 21 15:06:52 cinder kernel: ipw2200: Unable to load firmware: -2
> > Mar 21 15:06:52 cinder kernel: ipw2200: failed to register network
> > device
> 
> The firmware loading bug is caused by
> driver-core-handles-kobject_uevent-failure-while-device_add.patch
> 
> I've uploaded a revert patch to
> ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.21-rc4/2.6.21-rc4-mm1/hot-fixes/

For the record, with the following patches:

# hotfixes
mm-debug-check-for-the-fault-vs-invalidate-race.patch
mm-fix-fault-vs-invalidate-race-for-linear-mappings-fix-2.patch
mm-fix-fault-vs-invalidate-race-for-linear-mappings-fix.patch
revert-driver-core-handles-kobject_uevent-failure-while-device_add.patch
tty-in-tiocsctty-when-we-steal-a-tty-hang-it-up-fix.patch

...I just had a boot where firmware loading failed again. rmmod+insmod
fixed it.

-- 
Mathematics is the supreme nostalgia of our time.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] i386: Prevent early access to TSC to avoid crash on TSCless systems

2007-03-24 Thread Ingo Molnar

* Guillaume Chazarain <[EMAIL PROTECTED]> wrote:

> >+static int tsc_enabled;
> 
> So, now we have tsc_disable, tsc_enabled and tsc_unstable. I can 
> understand the latter, but this lacks orthogonality IMHO.

tsc_disable should be renamed to tsc_disable_override or so, to signal 
that it's only the mirror of the 'notsc' flag the user passed in over 
the boot line.

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] i386: Prevent early access to TSC to avoid crash on TSCless systems

2007-03-24 Thread Guillaume Chazarain

+static int tsc_enabled;


So, now we have tsc_disable, tsc_enabled and tsc_unstable.
I can understand the latter, but this lacks orthogonality IMHO.

--
Guillaume
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] i386: Prevent early access to TSC to avoid crash on TSCless systems

2007-03-24 Thread Ingo Molnar

* Thomas Gleixner <[EMAIL PROTECTED]> wrote:

> commit f9690982b8c2f9a2c65acdc113e758ec356676a3 removed the check for 
> cpu_khz from sched_clock(), which prevented early access to the TSC by 
> non obvious magic.
> 
> This is harmless as long as the CPU has a TSC. On TSCless systems this 
> results in an illegal instruction trap.
> 
> Replace tsc_disabled and tsc_unstable by tsc_enabled, which is only 
> set when the tsc is available and not unstable.
> 
> Signed-off-by: Thomas Gleixner <[EMAIL PROTECTED]>

oops, indeed! I think this should also resolve one of the bugs (TSC-less 
Cyrix?) that were reported against -mm some time ago.

Acked-by: Ingo Molnar <[EMAIL PROTECTED]>

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Why is NCQ enabled by default by libata? (2.6.20)

2007-03-24 Thread Robert Hancock

Justin Piszcz wrote:
Without NCQ, performance is MUCH better on almost every operation, with 
the exception of 2-3 items.


/usr/sbin/bonnie++ -d /x/bonnie -s 7952 -m p34 -n 16:10:16:64 > 
run.txt;


# Average of 3 runs with NCQ on for Quad Raptor ADFD 150 RAID 5 Software 
RAID:
p34-ncq-on,7952M,43916.3,96.6667,151943,28.6667,75794.3,18.6667,48991.3,99,181687,24,558.033,0.33,16:10:16/64,867.667,9,29972.7,98.,2801.67,16,890.667,9.3,27743,94.,2115.33,15.6667 

# Average of 3 runs with NCQ off for Quad Raptor ADFD 150 RAID 5 
Software RAID:
p34-ncq-off,7952M,42470,97.,200409,36.,90240.3,22.6667,48656,99,198853,27,546.467,0,16:10:16/64,972.333,10,21833,72.,3697,21,995,10.6667,27901.7,95.6667,2681,20.6667 



http://home.comcast.net/~jpiszcz/ncq_vs_noncq/results.html

In general, for networking, etc, the kernel chooses 'optimized' 
defaults, therefore, I was curious why is NCQ enabled by default?


Normally NCQ is faster, though it depends on the drive firmware. It's 
also possible that software RAID is a case where there are negative 
interactions.


--
Robert Hancock  Saskatoon, SK, Canada
To email, remove "nospam" from [EMAIL PROTECTED]
Home Page: http://www.roberthancock.com/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 3/3] update ctime and mtime for mmaped write

2007-03-24 Thread Miklos Szeredi
From: Miklos Szeredi <[EMAIL PROTECTED]>

Changes:
v3:
 o rename is_page_modified to test_clear_page_modified
v2:
 o set AS_CMTIME flag in clear_page_dirty_for_io() too
 o don't clear AS_CMTIME in file_update_time()
 o check the dirty bit in the page tables
v1:
 o moved check from __fput() to remove_vma(), which is more logical
 o changed set_page_dirty() to set_page_dirty_mapping in hugetlb.c
 o cleaned up #ifdef CONFIG_BLOCK mess

This patch makes writing to shared memory mappings update st_ctime and
st_mtime as defined by SUSv3:

   The st_ctime and st_mtime fields of a file that is mapped with
   MAP_SHARED and PROT_WRITE shall be marked for update at some point
   in the interval between a write reference to the mapped region and
   the next call to msync() with MS_ASYNC or MS_SYNC for that portion
   of the file by any process. If there is no such call and if the
   underlying file is modified as a result of a write reference, then
   these fields shall be marked for update at some time after the
   write reference.

A new address_space flag is introduced: AS_CMTIME.  This is set each
time a page is dirtied through a userspace memory mapping.  This
includes write accesses via get_user_pages().

Note, the flag is set unconditionally, even if the page is already
dirty.  This is important, because the page might have been dirtied
earlier by a non-mmap write.

This flag is checked in msync() and munmap()/mremap(), and if set, the
file times are updated and the flag is cleared.

Msync also needs to check the dirty bit in the page tables, because
the data might change again after an msync(MS_ASYNC), while the page
is already dirty and read-write.  This also makes the time updating
work for memory backed filesystems such as tmpfs.

This implementation walks the pages in the synced range, and uses rmap
to find all the ptes for each page.  Non-linear vmas are ignored,
since the ptes can only be found by scanning the whole vma, which is
very inefficient.

As an optimization, if dirty pages are accounted, then only walk the
dirty pages, since the clean pages necessarily have clean ptes.  This
doesn't work for memory backed filesystems, where no dirty accounting
is done.

An alternative implementation could check for all intersecting vmas in
the mapping and walk the page tables for each.  This would probably be
more efficient for memory backed filesystems and if the number of
dirty pages is near the total number of pages in the range.

Fixes Novell Bugzilla #206431.

Inspired by Peter Staubach's patch and the resulting comments.

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux-2.6.21-rc4-mm1/include/linux/pagemap.h
===
--- linux-2.6.21-rc4-mm1.orig/include/linux/pagemap.h   2007-03-24 
19:03:11.0 +0100
+++ linux-2.6.21-rc4-mm1/include/linux/pagemap.h2007-03-24 
19:34:30.0 +0100
@@ -19,6 +19,7 @@
  */
 #define AS_EIO  (__GFP_BITS_SHIFT + 0)  /* IO error on async write */
 #define AS_ENOSPC  (__GFP_BITS_SHIFT + 1)  /* ENOSPC on async write */
+#define AS_CMTIME  (__GFP_BITS_SHIFT + 2)  /* ctime/mtime update needed */
 
 static inline void mapping_set_error(struct address_space *mapping, int error)
 {
Index: linux-2.6.21-rc4-mm1/include/linux/mm.h
===
--- linux-2.6.21-rc4-mm1.orig/include/linux/mm.h2007-03-24 
19:04:15.0 +0100
+++ linux-2.6.21-rc4-mm1/include/linux/mm.h 2007-03-24 19:34:30.0 
+0100
@@ -808,6 +808,7 @@ int redirty_page_for_writepage(struct wr
struct page *page);
 int FASTCALL(set_page_dirty(struct page *page));
 int set_page_dirty_lock(struct page *page);
+int set_page_dirty_mapping(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
 extern unsigned long do_mremap(unsigned long addr,
Index: linux-2.6.21-rc4-mm1/mm/memory.c
===
--- linux-2.6.21-rc4-mm1.orig/mm/memory.c   2007-03-24 19:03:11.0 
+0100
+++ linux-2.6.21-rc4-mm1/mm/memory.c2007-03-24 19:34:30.0 +0100
@@ -676,7 +676,7 @@ static unsigned long zap_pte_range(struc
anon_rss--;
else {
if (pte_dirty(ptent))
-   set_page_dirty(page);
+   set_page_dirty_mapping(page);
if (pte_young(ptent))
SetPageReferenced(page);
file_rss--;
@@ -954,7 +954,7 @@ struct page *follow_page(struct vm_area_
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
-   set_page_dirty(page);
+   set_page_dirty_mapping(page);
   

[patch 2/3] only allow nonlinear vmas for ram backed filesystems

2007-03-24 Thread Miklos Szeredi
From: Miklos Szeredi <[EMAIL PROTECTED]>

Dirty page accounting/limiting doesn't work for nonlinear mappings, so
for non-ram backed filesystems emulate with linear mappings.  This
retains ABI compatibility with previous kernels at minimal code cost.

All known users of nonlinear mappings actually use tmpfs, so this
shouldn't have any negative effect.

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux-2.6.21-rc4-mm1/mm/fremap.c
===
--- linux-2.6.21-rc4-mm1.orig/mm/fremap.c   2007-03-24 22:30:05.0 
+0100
+++ linux-2.6.21-rc4-mm1/mm/fremap.c2007-03-24 22:37:59.0 +0100
@@ -181,6 +181,24 @@ asmlinkage long sys_remap_file_pages(uns
goto retry;
}
mapping = vma->vm_file->f_mapping;
+   /*
+* page_mkclean doesn't work on nonlinear vmas, so if dirty
+* pages need to be accounted, emulate with linear vmas.
+*/
+   if (mapping_cap_account_dirty(mapping)) {
+   unsigned long addr;
+
+   flags &= MAP_NONBLOCK;
+   addr = mmap_region(vma->vm_file, start, size, flags,
+  vma->vm_flags, pgoff, 1);
+   if (IS_ERR_VALUE(addr))
+   err = addr;
+   else {
+   BUG_ON(addr != start);
+   err = 0;
+   }
+   goto out;
+   }
spin_lock(&mapping->i_mmap_lock);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 1/3] split mmap

2007-03-24 Thread Miklos Szeredi
From: Miklos Szeredi <[EMAIL PROTECTED]>

This is a straightforward split of do_mmap_pgoff() into two functions:

 - do_mmap_pgoff() checks the parameters, and calculates the vma
   flags.  Then it calls

 - mmap_region(), which does the actual mapping

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux/mm/mmap.c
===
--- linux.orig/mm/mmap.c2007-03-24 21:00:40.0 +0100
+++ linux/mm/mmap.c 2007-03-24 22:28:52.0 +0100
@@ -893,14 +893,11 @@ unsigned long do_mmap_pgoff(struct file 
unsigned long flags, unsigned long pgoff)
 {
struct mm_struct * mm = current->mm;
-   struct vm_area_struct * vma, * prev;
struct inode *inode;
unsigned int vm_flags;
-   int correct_wcount = 0;
int error;
-   struct rb_node ** rb_link, * rb_parent;
int accountable = 1;
-   unsigned long charged = 0, reqprot = prot;
+   unsigned long reqprot = prot;
 
/*
 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1025,7 +1022,25 @@ unsigned long do_mmap_pgoff(struct file 
error = security_file_mmap(file, reqprot, prot, flags);
if (error)
return error;
-   
+
+   return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+  accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ unsigned int vm_flags, unsigned long pgoff,
+ int accountable)
+{
+   struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma, *prev;
+   int correct_wcount = 0;
+   int error;
+   struct rb_node **rb_link, *rb_parent;
+   unsigned long charged = 0;
+   struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
+
/* Clear old maps */
error = -ENOMEM;
 munmap_back:
@@ -1174,8 +1189,6 @@ unacct_error:
return error;
 }
 
-EXPORT_SYMBOL(do_mmap_pgoff);
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
Index: linux/include/linux/mm.h
===
--- linux.orig/include/linux/mm.h   2007-03-24 21:00:40.0 +0100
+++ linux/include/linux/mm.h2007-03-24 22:28:52.0 +0100
@@ -1035,6 +1035,10 @@ extern unsigned long get_unmapped_area(s
 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff);
+extern unsigned long mmap_region(struct file *file, unsigned long addr,
+   unsigned long len, unsigned long flags,
+   unsigned int vm_flags, unsigned long pgoff,
+   int accountable);
 
 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch] add file position info to proc

2007-03-24 Thread Miklos Szeredi
From: Miklos Szeredi <[EMAIL PROTECTED]>

This patch adds support for finding out the current file position,
open flags and possibly other info in the future.

These new entries are added:

  /proc/PID/fdinfo/FD
  /proc/PID/task/TID/fdinfo/FD

For each fd the information is provided in the following format:

pos:1234
flags:  012

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux/fs/proc/base.c
===
--- linux.orig/fs/proc/base.c   2007-03-24 19:00:48.0 +0100
+++ linux/fs/proc/base.c2007-03-24 22:28:14.0 +0100
@@ -1199,7 +1199,10 @@ out:
return ~0U;
 }
 
-static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct 
vfsmount **mnt)
+#define PROC_FDINFO_MAX 64
+
+static int proc_fd_info(struct inode *inode, struct dentry **dentry,
+   struct vfsmount **mnt, char *info)
 {
struct task_struct *task = get_proc_task(inode);
struct files_struct *files = NULL;
@@ -1218,8 +1221,16 @@ static int proc_fd_link(struct inode *in
spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
if (file) {
-   *mnt = mntget(file->f_path.mnt);
-   *dentry = dget(file->f_path.dentry);
+   if (mnt)
+   *mnt = mntget(file->f_path.mnt);
+   if (dentry)
+   *dentry = dget(file->f_path.dentry);
+   if (info)
+   snprintf(info, PROC_FDINFO_MAX,
+"pos:\t%lli\n"
+"flags:\t0%o\n",
+(long long) file->f_pos,
+file->f_flags);
spin_unlock(&files->file_lock);
put_files_struct(files);
return 0;
@@ -1230,6 +1241,12 @@ static int proc_fd_link(struct inode *in
return -ENOENT;
 }
 
+static int proc_fd_link(struct inode *inode, struct dentry **dentry,
+   struct vfsmount **mnt)
+{
+   return proc_fd_info(inode, dentry, mnt, NULL);
+}
+
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
struct inode *inode = dentry->d_inode;
@@ -1325,7 +1342,9 @@ out_iput:
goto out;
 }
 
-static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * 
dentry, struct nameidata *nd)
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+  struct dentry *dentry,
+  instantiate_t instantiate)
 {
struct task_struct *task = get_proc_task(dir);
unsigned fd = name_to_int(dentry);
@@ -1336,23 +1355,15 @@ static struct dentry *proc_lookupfd(stru
if (fd == ~0U)
goto out;
 
-   result = proc_fd_instantiate(dir, dentry, task, &fd);
+   result = instantiate(dir, dentry, task, &fd);
 out:
put_task_struct(task);
 out_no_task:
return result;
 }
 
-static int proc_fd_fill_cache(struct file *filp, void *dirent, filldir_t 
filldir,
-   struct task_struct *task, int fd)
-{
-   char name[PROC_NUMBUF];
-   int len = snprintf(name, sizeof(name), "%d", fd);
-   return proc_fill_cache(filp, dirent, filldir, name, len,
-   proc_fd_instantiate, task, &fd);
-}
-
-static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_readfd_common(struct file * filp, void * dirent,
+ filldir_t filldir, instantiate_t instantiate)
 {
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
@@ -1388,12 +1399,17 @@ static int proc_readfd(struct file * fil
for (fd = filp->f_pos-2;
 fd < fdt->max_fds;
 fd++, filp->f_pos++) {
+   char name[PROC_NUMBUF];
+   int len;
 
if (!fcheck_files(files, fd))
continue;
rcu_read_unlock();
 
-   if (proc_fd_fill_cache(filp, dirent, filldir, 
p, fd) < 0) {
+   len = snprintf(name, sizeof(name), "%d", fd);
+   if (proc_fill_cache(filp, dirent, filldir,
+   name, len, instantiate,
+   p, &fd) < 0) {
rcu_read_lock();
break;
}
@@ -1408,6 +1424,32 @@ out_no_task:
return retval;
 }
 
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentr

[patch 3/3] balance dirty pages from loop device

2007-03-24 Thread Miklos Szeredi
From: Miklos Szeredi <[EMAIL PROTECTED]>

The function do_lo_send_aops() should call
balance_dirty_pages_ratelimited() after each page similarly to
generic_file_buffered_write().

Without this, writing the loop device directly (not through a
filesystem) is very slow, and also slows the whole system down,
because nr_dirty is constantly over the limit.

Beware: this patch without the fix to balance_dirty_pages() makes a
loopback mounted filesystem prone to deadlock.

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux/drivers/block/loop.c
===
--- linux.orig/drivers/block/loop.c 2007-03-24 21:00:40.0 +0100
+++ linux/drivers/block/loop.c  2007-03-24 22:07:06.0 +0100
@@ -275,6 +275,8 @@ static int do_lo_send_aops(struct loop_d
pos += size;
unlock_page(page);
page_cache_release(page);
+   balance_dirty_pages_ratelimited(mapping);
+   cond_resched();
}
ret = 0;
 out:
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 2/3] remove throttle_vm_writeout()

2007-03-24 Thread Miklos Szeredi
From: Miklos Szeredi <[EMAIL PROTECTED]>

Remove this function.  Its purpose was to limit the global number of
writeback pages submitted by direct reclaim.  But this is equally
well accomplished by limited queue lengths.  When this function was
added, the device queues had much larger default lengths (8192
requests, now it's 128), causing problems.

When writable shared mapping support is added to fuse, this function
would be able to cause a deadlock if the userspace filesystem needs to
allocate memory while writing back dirty pages.

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux/include/linux/writeback.h
===
--- linux.orig/include/linux/writeback.h2007-03-24 22:07:00.0 
+0100
+++ linux/include/linux/writeback.h 2007-03-24 22:28:52.0 +0100
@@ -85,7 +85,6 @@ static inline void wait_on_inode(struct 
 int wakeup_pdflush(long nr_pages);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
-void throttle_vm_writeout(gfp_t gfp_mask);
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2007-03-24 22:07:00.0 +0100
+++ linux/mm/page-writeback.c   2007-03-24 22:28:52.0 +0100
@@ -312,37 +312,6 @@ void balance_dirty_pages_ratelimited_nr(
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
-void throttle_vm_writeout(gfp_t gfp_mask)
-{
-   long background_thresh;
-   long dirty_thresh;
-
-   if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
-   /*
-* The caller might hold locks which can prevent IO completion
-* or progress in the filesystem.  So we cannot just sit here
-* waiting for IO to complete.
-*/
-   congestion_wait(WRITE, HZ/10);
-   return;
-   }
-
-for ( ; ; ) {
-   get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
-
-/*
- * Boost the allowable dirty threshold a bit for page
- * allocators so they don't get DoS'ed by heavy writers
- */
-dirty_thresh += dirty_thresh / 10;  /* wh... */
-
-if (global_page_state(NR_UNSTABLE_NFS) +
-   global_page_state(NR_WRITEBACK) <= dirty_thresh)
-   break;
-congestion_wait(WRITE, HZ/10);
-}
-}
-
 /*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
Index: linux/mm/vmscan.c
===
--- linux.orig/mm/vmscan.c  2007-03-24 22:06:53.0 +0100
+++ linux/mm/vmscan.c   2007-03-24 22:07:03.0 +0100
@@ -952,8 +952,6 @@ static unsigned long shrink_zone(int pri
}
}
 
-   throttle_vm_writeout(sc->gfp_mask);
-
atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;
 }
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] i386: Prevent early access to TSC to avoid crash on TSCless systems

2007-03-24 Thread Thomas Gleixner
commit f9690982b8c2f9a2c65acdc113e758ec356676a3 removed the check for
cpu_khz from sched_clock(), which prevented early access to the TSC by
non obvious magic.

This is harmless as long as the CPU has a TSC. On TSCless systems this
results in an illegal instruction trap.

Replace tsc_disabled and tsc_unstable by tsc_enabled, which is only set
when the tsc is available and not unstable.

Signed-off-by: Thomas Gleixner <[EMAIL PROTECTED]>

diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 0e65f7a..6cb8f53 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -18,6 +18,8 @@
 
 #include "mach_timer.h"
 
+static int tsc_enabled;
+
 /*
  * On some systems the TSC frequency does not
  * change with the cpu frequency. So we need
@@ -105,7 +107,7 @@ unsigned long long sched_clock(void)
/*
 * Fall back to jiffies if there's no TSC available:
 */
-   if (tsc_unstable || unlikely(tsc_disable))
+   if (unlikely(!tsc_enabled))
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
@@ -283,6 +285,7 @@ void mark_tsc_unstable(void)
 {
if (!tsc_unstable) {
tsc_unstable = 1;
+   tsc_enabled = 0;
/* Can be called before registration */
if (clocksource_tsc.mult)
clocksource_change_rating(&clocksource_tsc, 0);
@@ -383,7 +386,9 @@ void __init tsc_init(void)
if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
-   }
+   } else
+   tsc_enabled = 1;
+
clocksource_register(&clocksource_tsc);
 
return;




-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 1/3] fix illogical behavior in balance_dirty_pages()

2007-03-24 Thread Miklos Szeredi
This is a slightly different take on the fix for the deadlock in fuse
with dirty balancing.  David Chinner convinced me, that per-bdi
counters are too expensive, and that it's not worth trying to account
the number of pages under writeback, as they will be limited by the
queue anyway.


From: Miklos Szeredi <[EMAIL PROTECTED]>

Current behavior of balance_dirty_pages() is to try to start writeout
into the specified queue for at least "write_chunk" number of pages.
If "write_chunk" pages have been submitted, then return.

However if there are less than "write_chunk" dirty pages for this
queue, then it doesn't return, waiting for the global dirty+writeback
counters to subside, but without doing any actual work.

This is illogical behavior: it allows more dirtyings while there are
dirty pages, but stops further dirtying completely if there are no
more dirty pages.

It also makes a deadlock possible when one filesystem is writing data
through another, and the balance_dirty_pages() for the lower
filesystem is stalling the writeback for the upper filesystem's
data (*).

So the exit condition should instead be:

  submitted at least "write_chunk" number of pages
OR
submitted ALL the dirty pages destined for this backing dev
  AND
backing dev is not congested

To do this, introduce a new counter in writeback_control, which counts
the number of dirty pages encountered during writeback.  This includes
all dirty pages, even those which are already under writeback but have
been dirtied again, and those which have been skipped due to having
locked buffers.

If this counter is zero after trying to submit some pages for
writeback, and the backing dev is uncongested, then don't wait any
more.  After this, newly dirtied pages can quickly be written back to
this backing dev.

If there are globally no more pages to submit for writeback
(nr_reclaimable == 0), then also don't wait for ever, only while this
backing dev is congested.

(*) For more info on this deadlock, see the following discussions:

  http://lkml.org/lkml/2007/3/1/9
  http://lkml.org/lkml/2007/3/12/16

Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
---

Index: linux/include/linux/writeback.h
===
--- linux.orig/include/linux/writeback.h2007-03-24 22:06:56.0 
+0100
+++ linux/include/linux/writeback.h 2007-03-24 22:29:02.0 +0100
@@ -44,6 +44,7 @@ struct writeback_control {
long nr_to_write;   /* Write this many pages, and decrement
   this for each page written */
long pages_skipped; /* Pages which were not written */
+   long nr_dirty;  /* Number of dirty pages encountered */
 
/*
 * For a_ops->writepages(): is start or end are non-zero then this is
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2007-03-24 22:06:56.0 +0100
+++ linux/mm/page-writeback.c   2007-03-24 22:29:02.0 +0100
@@ -207,7 +207,15 @@ static void balance_dirty_pages(struct a
 * written to the server's write cache, but has not yet
 * been flushed to permanent storage.
 */
-   if (nr_reclaimable) {
+   if (!nr_reclaimable) {
+   /*
+* If there's nothing more to write back and this queue
+* is uncongested,  then it is possible to quickly
+* write out some more data, so let's not wait
+*/
+   if (!bdi_write_congested(bdi))
+   break;
+   } else {
writeback_inodes(&wbc);
get_dirty_limits(&background_thresh,
&dirty_thresh, mapping);
@@ -220,6 +228,14 @@ static void balance_dirty_pages(struct a
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break;  /* We've done our duty */
+
+   /*
+* If there are no more dirty pages for this backing
+* dev, and the queue is not congested, then
+* it is possible to quickly write out some more data
+*/
+   if (!wbc.nr_dirty && !bdi_write_congested(bdi))
+   break;
}
congestion_wait(WRITE, HZ/10);
}
@@ -619,6 +635,7 @@ retry:
  min(end - index, 
(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;
 
+   wbc->nr_dirty += nr_pages;
scanned = 1;
for (i = 0; i < nr_pages; i++) {
  

Re: [rfc][patch] queued spinlocks (i386)

2007-03-24 Thread Andrew Morton
> On Fri, 23 Mar 2007 11:32:44 +0100 Nick Piggin <[EMAIL PROTECTED]> wrote:
>
> I'm not as concerned about the contended performance of spinlocks
>

The contended case matters.  Back in 2.5.something I screwed up the debug
version of one of the locks (rwlock, iirc) - it was simply missing a
cpu_relax(), and some people's benchmarks halved.

> This was just something I had in mind when the hardware lock
> starvation issue came up

It looks like a good way to address the lru_lock starvation/capture
problem.  But I think I'd be more comfortable if we were to introduce it as
a new lock type, rather than as a reimplementation of the existing
spin_lock().   Initially, at least.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mm snapshot broken-out-2007-03-24-00-14.tar.gz uploaded

2007-03-24 Thread David Miller
From: Andrew Morton <[EMAIL PROTECTED]>
Date: Sat, 24 Mar 2007 10:16:53 -0800

> On Sat, 24 Mar 2007 18:18:42 +0100 "Michal Piotrowski" <[EMAIL PROTECTED]> 
> wrote:
> 
> > On 24/03/07, [EMAIL PROTECTED] <[EMAIL PROTECTED]> wrote:
> > > The mm snapshot broken-out-2007-03-24-00-14.tar.gz has been uploaded to
> > >
> > >
> > > ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/mm/broken-out-2007-03-24-00-14.tar.gz
> > >
> > 
> > My network doesn't work
> > "RTNETLINK answers: Invalid argument"
> > 
> > git-net* patches?
> > 
> 
> There's a huge collision between git-net and git-wireless which I haven't
> started to look at yet.  I suggest you forget about that particular
> snapshot.

We fixed that bug about a day after you likely took a snapshot
of the net-2.6.22 tree.

Here is the fix:

commit 65f96c1f4b549a3c3c19cebf9f0795c0d8fb35f0
Author: Thomas Graf <[EMAIL PROTECTED]>
Date:   Thu Mar 22 21:41:06 2007 -0700

[RTNL]: Properly return rntl message handler

Signed-off-by: Thomas Graf <[EMAIL PROTECTED]>
Signed-off-by: David S. Miller <[EMAIL PROTECTED]>

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 35ce9f7..3a295e3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -122,10 +122,10 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int 
msgindex)
struct rtnl_link *tab;
 
tab = rtnl_msg_handlers[protocol];
-   if (tab == NULL || tab->doit == NULL)
+   if (tab == NULL || tab[msgindex].doit == NULL)
tab = rtnl_msg_handlers[PF_UNSPEC];
 
-   return tab ? tab->doit : NULL;
+   return tab ? tab[msgindex].doit : NULL;
 }
 
 static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
@@ -133,10 +133,10 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int 
msgindex)
struct rtnl_link *tab;
 
tab = rtnl_msg_handlers[protocol];
-   if (tab == NULL || tab->dumpit == NULL)
+   if (tab == NULL || tab[msgindex].dumpit == NULL)
tab = rtnl_msg_handlers[PF_UNSPEC];
 
-   return tab ? tab->dumpit : NULL;
+   return tab ? tab[msgindex].dumpit : NULL;
 }
 
 /**
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [6/6] 2.6.21-rc4: known regressions

2007-03-24 Thread David Miller
From: David Miller <[EMAIL PROTECTED]>
Date: Mon, 19 Mar 2007 19:38:29 -0700 (PDT)

> From: Adrian Bunk <[EMAIL PROTECTED]>
> Date: Sun, 18 Mar 2007 19:49:38 +0100
> 
> > Subject: ipv6 crash
> > References : http://lkml.org/lkml/2007/3/10/2
> > Submitter  : Len Brown <[EMAIL PROTECTED]>
> > Status : unknown
> 
> This is caused by some problem in the router round-robin code in
> net/ipv6/route.c:rt6_select()
 ...
> I'll see if I can come up with something to fix this properly.

Here is the fix I came up with and just posted to netdev for
a quick review, I'll push this to the appropriate places soon
if nobody spots any problems in it.

commit 4c68db63b8314df3cf30b7fe595a1b8935bb2cb0
Author: David S. Miller <[EMAIL PROTECTED]>
Date:   Sat Mar 24 12:06:32 2007 -0700

[IPV6]: Fix routing round-robin locking.

As per RFC2461, section 6.3.6, item #2, when no routers on the
matching list are known to be reachable or probably reachable we
do round robin on those available routes so that we make sure
to probe as many of them as possible to detect when one becomes
reachable faster.

Each routing table has a rwlock protecting the tree and the linked
list of routes at each leaf.  The round robin code executes during
lookup and thus with the rwlock taken as a reader.  A small local
spinlock tries to provide protection but this does not work at all
for two reasons:

1) The round-robin list manipulation, as coded, goes like this (with
   read lock held):

walk routes finding head and tail

spin_lock();
rotate list using head and tail
spin_unlock();

   While one thread is rotating the list, another thread can
   end up with stale values of head and tail and then proceed
   to corrupt the list when it gets the lock.  This ends up causing
   the OOPS in fib6_add() later on that many people have been hitting.

2) All the other code paths that run with the rwlock held as
   a reader do not expect the list to change on them, they
   expect it to remain completely fixed while they hold the
   lock in that way.

So, simply stated, it is impossible to implement this correctly using
a manipulation of the list without violating the rwlock locking
semantics.

Reimplement using a per-fib6_node round-robin pointer.  This way we
don't need to manipulate the list at all, and since the round-robin
pointer can only ever point to real existing entries we don't need
to perform any locking on the changing of the round-robin pointer
itself.  We only need to reset the round-robin pointer to NULL when
the entry it is pointing to is removed.

The idea is from Thomas Graf and it is very similar to how this
was implemented before the advanced router selection code went in.

Signed-off-by: David S. Miller <[EMAIL PROTECTED]>

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 9eda572..cf355a3 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -58,6 +58,7 @@ struct fib6_node
__u16   fn_bit; /* bit key */
__u16   fn_flags;
__u32   fn_sernum;
+   struct rt6_info *rr_ptr;
 };
 
 #ifndef CONFIG_IPV6_SUBTREES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f4d7be7..c46f909 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1109,6 +1109,10 @@ static void fib6_del_route(struct fib6_node *fn, struct 
rt6_info **rtp,
rt6_stats.fib_rt_entries--;
rt6_stats.fib_discarded_routes++;
 
+   /* Reset round-robin state, if necessary */
+   if (fn->rr_ptr == rt)
+   fn->rr_ptr = NULL;
+
/* Adjust walkers */
read_lock(&fib6_walker_lock);
FOR_WALKERS(w) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a6b3117..3931b33 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -363,55 +363,76 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
return m;
 }
 
-static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
-  int strict)
+static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
+  int *mpri, struct rt6_info *match)
 {
-   struct rt6_info *match = NULL, *last = NULL;
-   struct rt6_info *rt, *rt0 = *head;
-   u32 metric;
+   int m;
+
+   if (rt6_check_expired(rt))
+   goto out;
+
+   m = rt6_score_route(rt, oif, strict);
+   if (m < 0)
+   goto out;
+
+   if (m > *mpri) {
+   if (strict & RT6_LOOKUP_F_REACHABLE)
+   rt6_probe(match);
+   *mpri = m;
+   match = rt;
+   } else if (strict & RT6_LOOKUP_F_REACHABLE) {
+   rt6_probe(rt);
+   }
+
+out:
+   return match;
+}
+
+static struct rt6_info *find_rr_leaf(s

Re: [linux-usb-devel] [PATCH 0/2] [SERIAL] [USB] fixed to skip NULL entry in struct serial usb_serial_port.

2007-03-24 Thread Greg KH
On Sun, Mar 25, 2007 at 12:52:27AM +0900, Noriaki TAKAMIYA wrote:
> Hi,
> 
>   When I boot using linux-2.6.21-rc4 on ThinkPad T41 with pl2303 USB
>   serial device plugged in, the kernel crashes.
> 
>   The reason is struct usb_serial_port is referenced without checking
>   whether it is NULL or not.

This should already be fixed in the -git snapshots that have come out
after 2.6.21-rc4.  Can you test them to verify this?

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Thomas Meyer
Eric W. Biederman schrieb:
>
> Odd.  I would have thought the oops happened in the first resume, not
> the second. 
>
> Hmm.  It may have something to do with the ``managed'' driver
> aspect of this as well..
>   
No. I don't think so. The problem is caused by this sequence: (the info
is always before entry of a function and before the exit of a function):

1.) Normal boot
[kernel] ahci :00:1f.2: version 2.1
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 0
[kernel] pci_enable_device: msi_enabled= 0
[kernel] PCI: Enabling device :00:1f.2 (0005 -> 0007)
[kernel] ACPI: PCI Interrupt :00:1f.2[B] -> GSI 19 (level, low) ->
IRQ 19
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 19
[kernel] pci_enable_device: msi_enabled= 0

2.) msi irq 218 gets assigned

3) First suspend to disk. Consists of
3a) Suspend devices
[kernel] ahci :00:1f.2: freeze
[kernel] pci_disable_device: dev= c1a59000
[kernel] pci_disable_device: irq= 218
[kernel] pci_disable_device: msi_enabled= 1
[kernel] ACPI: PCI interrupt for device :00:1f.2 disabled
[kernel] pci_disable_device: dev= c1a59000
[kernel] pci_disable_device: irq= 218
[kernel] pci_disable_device: msi_enabled= 1

3b) Disable non-boot cpus
3c) Snapshot memory
3d) Enable non-boot cpus
3e) Resume devices (after snapshot!)
[kernel] ahci :00:1f.2: resuming
[kernel] PM: Writing back config space on device :00:1f.2 at offset
1 (was 2b00403, writing 2b00407)
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 218
[kernel] pci_enable_device: msi_enabled= 1
[kernel] ACPI: PCI Interrupt :00:1f.2[B] -> GSI 19 (level, low) ->
IRQ 19
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 19
[kernel] pci_enable_device: msi_enabled= 1

3f) Write memory image
3g) Power down + reboot

4a) Normal start and restore memory image
4b) Enable non-boot cpus
4c) Resume devices
[kernel] ahci :00:1f.2: resuming
[kernel] PM: Writing back config space on device :00:1f.2 at offset
1 (was 2b00403, writing 2b00407)
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 218
[kernel] pci_enable_device: msi_enabled= 1
[kernel] ACPI: PCI Interrupt :00:1f.2[B] -> GSI 19 (level, low) ->
IRQ 19
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 19
[kernel] pci_enable_device: msi_enabled= 1
Now the system is running with irq=19 and msi enabled=1. So let's
suspend again:

5) Second suspend to disk consists of
5a) Suspend devices
[kernel] ahci :00:1f.2: freeze
[kernel] pci_disable_device: dev= c1a59000
[kernel] pci_disable_device: irq= 19
[kernel] pci_disable_device: msi_enabled= 1
[kernel] ACPI: PCI interrupt for device :00:1f.2 disabled
[kernel] pci_disable_device: dev= c1a59000
[kernel] pci_disable_device: irq= 19
[kernel] pci_disable_device: msi_enabled= 1

5b) Disable non-boot cpus
5c) Snapshot memory
5d) Enable non-boot cpus
5e) Resume devices
[kernel] pci_enable_device: dev= c1a59000
[kernel] pci_enable_device: irq= 19
[kernel] pci_enable_device: msi_enabled= 1

-> OOPS in restore_msi because it tries to access msi structure for irq
19 and not 218.

So i guess this has nothing to do with the managed pci functions?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: race condition in dm-crypt?

2007-03-24 Thread Kasper Sandberg
On Fri, 2007-03-23 at 21:41 +0100, Christoph Maier wrote:
> Jan C. Nordholz wrote:
> > I think I'm experiencing a race condition: Irregularly my kernel runs
> > into an Oops when it tries to initialize my crypt containers.
> 
> FYI, there are similar reports on the net, going as far back as May 2006:
> http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/1636 
> is the oldest one I could find.
> 
> Bugzilla entry: http://bugzilla.kernel.org/show_bug.cgi?id=7388
> 
> I, too, ran into the bug and failed to reproduce it. However, it might 
> be worth knowing that the system went to 100% iowait afterwards.
Very interesting actually. I myself run dm-crypt and somewhat regularly
my io stops for 5-10 seconds, with seemingly no errors or high load, io
just stalls, and then returns after a while.

> 
> Regards, Christoph Maier
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux-VServer example results for sharing vs. separate mappings ...

2007-03-24 Thread Andrew Morton
On Sat, 24 Mar 2007 19:38:06 +0100 Herbert Poetzl <[EMAIL PROTECTED]> wrote:

> On Fri, Mar 23, 2007 at 09:42:35PM -0800, Andrew Morton wrote:
> > On Fri, 23 Mar 2007 20:30:00 +0100 Herbert Poetzl <[EMAIL PROTECTED]> wrote:
> > 
> > > 
> > > Hi Eric!
> > > Hi Folks!
> > > 
> > > here is a real world example result from one of my tests
> > > regarding the benefit of sharing over separate memory
> > > 
> > > the setup is quite simple, a typical machine used by
> > > providers all over the world, a dual Pentium D 3.2GHz
> > > with 4GB of memory and a single 160GB SATA disk running
> > > a Linux-VServer kernel (2.6.19.7-vs2.2.0-rc18)
> > > 
> > > the Guest systems used are Mandriva 2007 guests with
> > > syslog, crond, sshd, apache, postfix and postgresql
> > > installed and running (all in all 17 processes per guest)
> > > 
> > > the disk space used by one guests is roughly 148MB
> > > 
> > > in addition to that, a normal host system is running
> > > with a few daemons (like sshd, httpd, postfix ...)
> > > 
> > > 
> > > the first test setup is starting 200 of those guests
> > > one after the other and measuring the memory usage
> > > before and after the guest did start, as well as 
> > > recording the time used to start them ...
> > > 
> > > this is done right after the machine was rebooted, in
> > > one test with 200 separate guests (i.e. 200 x 148MB) 
> > > and in a second run with 200 unified guests (which
> > > means roughly 138MB of shared files)
> > 
> > Please define your terms.  
> > What is a "separated guest", what is a "unified guest" 
> > and how do they differ?
> 
> separated guests are complete Linux Distributions which
> do not share (filesystem wise) anything with any other
> guest ... i.e. all files and executables have to be
> paged in and get separate mappings (and thus separate
> memory)
> 
> unified guests use a mechanism we (Linux-VServer) call
> 'unification' which can be considered an advanced form
> of hard linking (i.e. we add special flags to protect
> those hard links from modification. such a file is 
> copied on demand (CoW Link Breaking) on the first attempt
> to be modified (attributes or content)

OK.

> > If a "separated" guest is something in which separate 
> > guests will use distinct physical pages to cache the 
> > contents of /etc/passwd (ie: a separate filesystem 
> > per guest) then I don't think that's interesting 
> > information, frankly.
> 
> well, you didn't bother to answer my questions regarding
> your suggested approach yet,

Have been a bit distracted lately, and these discussions seem to go on and
on without ever converging.

> and as I am concerned that
> some of the suggested approaches sacrifice performance
> and resource sharing/efficiency for simplicity or (as
> we recently had) 'ability to explain it to the customer'

The problem is memory reclaim.  A number of schemes which have been
proposed require a per-container page reclaim mechanism - basically a
separate scanner.

This is a huge, huge, huge problem.  The present scanner has been under
development for over a decade and has had tremendous amounts of work and
testing put into it.  And it still has problems.  But those problems will
be gradually addressed.

A per-container reclaim scheme really really really wants to reuse all that
stuff rather than creating a separate, parallel, new scanner which has the
same robustness requirements, only has a decade less test and development
done on it.  And which permanently doubles our maintenance costs.

So how do we reuse our existing scanner?  With physical containers.  One
can envisage several schemes:

a) slice the machine into 128 fake NUMA nodes, use each node as the
   basic block of memory allocation, manage the binding between these
   memory hunks and process groups with cpusets.

   This is what google are testing, and it works.

b) Create a new memory abstraction, call it the "software zone", which
   is mostly decoupled from the present "hardware zones".  Most of the MM
   is reworked to use "software zones".  The "software zones" are
   runtime-resizeable, and obtain their pages via some means from the
   hardware zones.  A container uses a software zone.

c) Something else, similar to the above.  Various schemes can be
   envisaged, it isn't terribly important for this discussion.


Let me repeat: this all has a huge upside in that it reuses the existing
page reclamation logic.  And cpusets.  Yes, we do discover glitches, but
those glitches (such as Christoph's recent discovery of suboptimal
interaction between cpusets and the global dirty ratio) get addressed, and
we tend to strengthen the overall MM system as we address them.


So what are the downsides?  I think mainly the sharing issue:

> > The issue with pagecache (afaik) is that if we use 
> > containers based on physical pages (an approach which 
> > is much preferred by myself) then we can get in a 
> > situation where a pagecache page is physically in 
> > container A, is not actually

Re: I/O memory barriers vs SMP memory barriers

2007-03-24 Thread Benjamin Herrenschmidt
On Fri, 2007-03-23 at 13:43 +, David Howells wrote:
> [Resend - this time with a comma in the addresses, not a dot]
> 
> Lennert Buytenhek <[EMAIL PROTECTED]> wrote:
> 
> > [ background: On ARM, SMP synchronisation does need barriers but device
> >   synchronisation does not.  The question is that given this, whether
> >   mb() and friends can be NOPs on ARM or not (i.e. whether mb() is
> >   supposed to sync against other CPUs or not, or whether only smp_mb()
> >   can be used for this.)  ]
> 
> H...
> 
> I see your problem.  I think the right way to deal with this is to get rid of
> mb(), rmb(), wmb() and read_barrier_depends() and replace them with io_mb(),
> io_rmb(), ...

Hrm... I'm not sure I like the io_* name, I think it's even more
confusing, people will never know when to use what ...

Maybe we should dig out again my attempt at properly defining semantics
of IO accessors and related barriers and extend it to include CPU vs.
DMA barriers.

Ben.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-03-24 Thread Paul Jackson
> IMO, we need to use task_lock() in container_exit() to avoid this race.
> 
> (I think this race already exists in mainline cpuset.c?)
> 
> P.S : cpuset.c checks for PF_EXITING twice in attach_task(), while this
> patch seems to be checking only once. Is that fine?

I think the cpuset code is ok, because, as you note, it locks the task,
picks off the cpuset pointer, and then checks a second time that the
task still does not have PF_EXITING set:

In the kernel/cpuset.c code for attach_task():

task_lock(tsk);
oldcs = tsk->cpuset;
/*
 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
 * then fail this attach_task(), to avoid breaking top_cpuset.count.
 */
if (tsk->flags & PF_EXITING) {
task_unlock(tsk);
mutex_unlock(&callback_mutex);
put_task_struct(tsk);
return -ESRCH;
}

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86_64: avoid sending LOCAL_TIMER_VECTOR IPI to itself

2007-03-24 Thread Ingo Molnar

* Ray Lee <[EMAIL PROTECTED]> wrote:

> Subject: [PATCH] x86_64: avoid sending LOCAL_TIMER_VECTOR IPI to 
> itself
> 
> Ray Lee reported, that on an UP kernel with "noapic" command line 
> option set, the box locks hard during boot.

i think this bug deserves a bit more attention, because similar problems 
could be in other codepaths too.

the problem here is that we tried to send an IPI to ourselves - which 
confused Ray's system which has an IO-APIC, but where due to noapic we 
keep the IO-APIC in its BIOS default.

this isnt a new problem: the new time code just exposed it more 
prominently than it was visible before. (the SMP kernel probably would 
hang in a similar way on Ray's system)

i dont see any clear debugging in the IPI code that excludes self-IPIs. 
I think the only valid way to do that is to use DEST_SELF. Andi?

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [rfc][patch] queued spinlocks (i386)

2007-03-24 Thread Nikita Danilov
Ingo Molnar writes:
 > 
 > * Nikita Danilov <[EMAIL PROTECTED]> wrote:
 > 
 > > Indeed, this technique is very well known. E.g., 
 > > http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
 > > section (3. Local-spin Algorithms) on them, citing papers from the 
 > > 1990 onward.
 > 
 > that is a cool reference! So i'd suggest to do (redo?) the patch based 
 > on those concepts and that terminology and not use 'queued spinlocks' 

There is some old version:

http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p06-locallock.patch
http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p07-locallock-bkl.patch
http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p08-locallock-zone.patch

http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p0b-atomic_dec_and_locallock.patch
http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p0c-locallock-dcache.patch

This version retains original spin-lock interface (i.e., no additional
"queue link" pointer is passed to the locking function). As a result,
lock data structure contains an array of NR_CPU counters, so it's only
suitable for global statically allocated locks.

 > that are commonly associated with MS's stuff. And as a result the 
 > contended case would be optimized some more via local-spin algorithms. 
 > (which is not a key thing for us, but which would be nice to have 
 > nevertheless)
 > 
 >  Ingo

Nikita.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Eric W. Biederman
Thomas Meyer <[EMAIL PROTECTED]> writes:

> Eric W. Biederman schrieb:
>> Thomas Meyer <[EMAIL PROTECTED]> writes:
>>
>>   
>>> Adrian Bunk schrieb:
>>> 
 Subject: second suspend to disk in a row results in an oops  (libata?)
 References : http://lkml.org/lkml/2007/3/17/43
 Submitter  : Thomas Meyer <[EMAIL PROTECTED]>
 Status : unknown
   
   
>>> The problem is identified: http://lkml.org/lkml/2007/3/22/150
>>> 
>>
>> Given the description above I'm a little confused.  Doesn't this
>> happen every time now?
>>   
> With current git head the oops happens in the second suspend to disk
> attempt in a row.

Odd.  I would have thought the oops happened in the first resume, not
the second. 

Hmm.  It may have something to do with the ``managed'' driver
aspect of this as well..

>> Or was this happening only the second time before I started my msi
>> fixes... 
>>   
> So i think, that the current git head already contains your msi fixes.

Yes it does.

> I don't know if this already happened before your msi changes, but i can
> test 2.6.20 if you like to?

Sure.  A data point if you boot with nomsi or have a kernel compiled
without msi support would be interesting as well.

As the problem case may not show up without msi support in the picture.

Eric

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux-VServer example results for sharing vs. separate mappings ...

2007-03-24 Thread Herbert Poetzl
On Fri, Mar 23, 2007 at 09:42:35PM -0800, Andrew Morton wrote:
> On Fri, 23 Mar 2007 20:30:00 +0100 Herbert Poetzl <[EMAIL PROTECTED]> wrote:
> 
> > 
> > Hi Eric!
> > Hi Folks!
> > 
> > here is a real world example result from one of my tests
> > regarding the benefit of sharing over separate memory
> > 
> > the setup is quite simple, a typical machine used by
> > providers all over the world, a dual Pentium D 3.2GHz
> > with 4GB of memory and a single 160GB SATA disk running
> > a Linux-VServer kernel (2.6.19.7-vs2.2.0-rc18)
> > 
> > the Guest systems used are Mandriva 2007 guests with
> > syslog, crond, sshd, apache, postfix and postgresql
> > installed and running (all in all 17 processes per guest)
> > 
> > the disk space used by one guests is roughly 148MB
> > 
> > in addition to that, a normal host system is running
> > with a few daemons (like sshd, httpd, postfix ...)
> > 
> > 
> > the first test setup is starting 200 of those guests
> > one after the other and measuring the memory usage
> > before and after the guest did start, as well as 
> > recording the time used to start them ...
> > 
> > this is done right after the machine was rebooted, in
> > one test with 200 separate guests (i.e. 200 x 148MB) 
> > and in a second run with 200 unified guests (which
> > means roughly 138MB of shared files)
> 
> Please define your terms.  
> What is a "separated guest", what is a "unified guest" 
> and how do they differ?

separated guests are complete Linux Distributions which
do not share (filesystem wise) anything with any other
guest ... i.e. all files and executables have to be
paged in and get separate mappings (and thus separate
memory)

unified guests use a mechanism we (Linux-VServer) call
'unification' which can be considered an advanced form
of hard linking (i.e. we add special flags to protect
those hard links from modification. such a file is 
copied on demand (CoW Link Breaking) on the first attempt
to be modified (attributes or content)

so although all guests use a separate namespace (i.e.
will have separate dentries) they share most of the files
(those which are not modified) via inodes (and the inode
cache of course)

> If a "separated" guest is something in which separate 
> guests will use distinct physical pages to cache the 
> contents of /etc/passwd (ie: a separate filesystem 
> per guest) then I don't think that's interesting 
> information, frankly.

well, you didn't bother to answer my questions regarding
your suggested approach yet, and as I am concerned that
some of the suggested approaches sacrifice performance
and resource sharing/efficiency for simplicity or (as
we recently had) 'ability to explain it to the customer'
I thought I provide some data how much resource sharing
can help (the overall performance)

> Because nobody (afaik) is proposing that pagecache be
> duplicated across instances in this fashion.
> 
> We obviously must share pagecache across instances - 
> if we didn't want to do that then we could do something
> completely dumb such as use xen/kvm/vmware/etc ;)

exactly my words ...

> The issue with pagecache (afaik) is that if we use 
> containers based on physical pages (an approach which 
> is much preferred by myself) then we can get in a 
> situation where a pagecache page is physically in 
> container A, is not actually used by any process in 
> container A, but is being repeatedly referenced by 
> processes which are in other containers and hence 
> unjustly consumes resources in container A.  

> How significant a problem this is likely to be I do 
> not know. 

well, with a little imagination, you can extrapolate
that from the data you removed from this email, as one
example case would be to start two unified guests one
after the other, then shutdown almost everything in
the first one, you will end up with the first one being
accounted all the 'shared' data used by the second one
while the second one will have roughly the resources
accounted the first one actually uses ...

note that the 'frowned upon' accounting Linux-VServer
does seems to work for those cases quite fine .. here
the relevant accounting/limits for three guests, the
first two unified and started in strict sequence, the
third one completely separate

Limitcurrent min/max  soft/hard hits
VM:41739   0/   64023   -1/  -1  0
RSS:8073   0/9222   -1/  -1  0
ANON:   3110   0/3405   -1/  -1  0
RMAP:   4960   0/5889   -1/  -1  0
SHM:7138   0/7138   -1/  -1  0

Limitcurrent min/max  soft/hard hits
VM:41738   0/   64163   -1/  -1  0
RSS:8058   0/9383   -1/  -1  0
ANON:   3108   0/3505   -1/  -1  0
RMAP:   4950   0/5912   -1/  -1  0
SHM:7138   0/ 

Re: [2/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Ray Lee
Thomas Gleixner wrote:
>> Patch reproduced below, with an acked-by (and, uhm, a couple of spelling
>> fixes in the description -- don't hate me, 'kay?).
> 
> I know that my English sucks.

Your English is fantastic, and far better than my German ever will be, so
no worries :-).

~r.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Thomas Meyer
Eric W. Biederman schrieb:
> Thomas Meyer <[EMAIL PROTECTED]> writes:
>
>   
>> Adrian Bunk schrieb:
>> 
>>> Subject: second suspend to disk in a row results in an oops  (libata?)
>>> References : http://lkml.org/lkml/2007/3/17/43
>>> Submitter  : Thomas Meyer <[EMAIL PROTECTED]>
>>> Status : unknown
>>>   
>>>   
>> The problem is identified: http://lkml.org/lkml/2007/3/22/150
>> 
>
> Given the description above I'm a little confused.  Doesn't this
> happen every time now?
>   
With current git head the oops happens in the second suspend to disk
attempt in a row.
> Or was this happening only the second time before I started my msi
> fixes... 
>   
So i think, that the current git head already contains your msi fixes.
I don't know if this already happened before your msi changes, but i can
test 2.6.20 if you like to?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mm snapshot broken-out-2007-03-24-00-14.tar.gz uploaded

2007-03-24 Thread Andrew Morton
On Sat, 24 Mar 2007 18:18:42 +0100 "Michal Piotrowski" <[EMAIL PROTECTED]> 
wrote:

> On 24/03/07, [EMAIL PROTECTED] <[EMAIL PROTECTED]> wrote:
> > The mm snapshot broken-out-2007-03-24-00-14.tar.gz has been uploaded to
> >
> >
> > ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/mm/broken-out-2007-03-24-00-14.tar.gz
> >
> 
> My network doesn't work
> "RTNETLINK answers: Invalid argument"
> 
> git-net* patches?
> 

There's a huge collision between git-net and git-wireless which I haven't
started to look at yet.  I suggest you forget about that particular
snapshot.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] fix sysfs rom file creation for BIOS ROM shadows

2007-03-24 Thread Jesse Barnes
At one time, if a BIOS ROM shadow was detected for the boot video
device (stored at offset 0xc0000), we'd set a special resource flag,
IORESOURCE_ROM_SHADOW, so that the sysfs ROM file code could handle
it properly.  That broke along the way somewhere though, so current
kernels will be missing 'rom' files in sysfs if the video device
doesn't have an explicit ROM BAR.

This patch fixes the regression by moving the video fixup quirk to a
little later in the boot cycle (to avoid having its work undone by
PCI resource allocation) and checking in the PCI sysfs code whether
a rom file should be created due to a shadow resource, which is also
moved to a little later in the boot cycle so it will occur after the
video fixup.  Tested and works on my i386 test box.

Signed-off-by:  Jesse Barnes <[EMAIL PROTECTED]>

diff -Napur -X /home/jbarnes/dontdiff linux-2.6.21-rc4/arch/i386/pci/fixup.c 
linux-2.6.21-rc4-modesetting/arch/i386/pci/fixup.c
--- linux-2.6.21-rc4/arch/i386/pci/fixup.c  2007-03-15 17:20:01.0 
-0700
+++ linux-2.6.21-rc4-modesetting/arch/i386/pci/fixup.c  2007-03-24 
10:46:57.0 -0700
@@ -354,7 +354,7 @@ static void __devinit pci_fixup_video(st
printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev));
}
 }
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
 
 /*
  * Some Toshiba laptops need extra code to enable their TI TSB43AB22/A.
diff -Napur -X /home/jbarnes/dontdiff linux-2.6.21-rc4/drivers/pci/pci-sysfs.c 
linux-2.6.21-rc4-modesetting/drivers/pci/pci-sysfs.c
--- linux-2.6.21-rc4/drivers/pci/pci-sysfs.c2007-03-15 17:20:01.0 
-0700
+++ linux-2.6.21-rc4-modesetting/drivers/pci/pci-sysfs.c2007-03-24 
10:46:42.0 -0700
@@ -620,7 +620,8 @@ int __must_check pci_create_sysfs_dev_fi
goto err_bin_file;
 
/* If the device has a ROM, try to expose it in sysfs. */
-   if (pci_resource_len(pdev, PCI_ROM_RESOURCE)) {
+   if (pci_resource_len(pdev, PCI_ROM_RESOURCE) ||
+   (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW)) {
rom_attr = kzalloc(sizeof(*rom_attr), GFP_ATOMIC);
if (rom_attr) {
pdev->rom_attr = rom_attr;
@@ -695,4 +696,4 @@ static int __init pci_sysfs_init(void)
return 0;
 }
 
-__initcall(pci_sysfs_init);
+late_initcall(pci_sysfs_init);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Eric W. Biederman
Thomas Meyer <[EMAIL PROTECTED]> writes:

> Adrian Bunk schrieb:
>> Subject: second suspend to disk in a row results in an oops  (libata?)
>> References : http://lkml.org/lkml/2007/3/17/43
>> Submitter  : Thomas Meyer <[EMAIL PROTECTED]>
>> Status : unknown
>>   
>
> The problem is identified: http://lkml.org/lkml/2007/3/22/150

Given the description above I'm a little confused.  Doesn't this
happen every time now?

Or was this happening only the second time before I started my msi
fixes... 

Eric
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


glibc vs 2.6/2.4

2007-03-24 Thread Yakov Lerner

Can I build recent glibc such that it will work both on
2.6 and on 2.4 ? (multithreading-wise, I suppose). I tried
to boot recent 2.6-based distro with 2.4 kernel and it did not work.

Do I need to set some env.vars maybe (LD_ASSUME_KERNEL ?
GNU_LIBPTHREAD_VERSION ?) for glibc when I switch kernels ?

Yakov Lerner
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Why is NCQ enabled by default by libata? (2.6.20)

2007-03-24 Thread Alan Cox
On Sat, 24 Mar 2007 12:38:02 -0400 (EDT)
Justin Piszcz <[EMAIL PROTECTED]> wrote:

> Without NCQ, performance is MUCH better on almost every operation, with 
> the exception of 2-3 items.

It depends on the drive. Generally NCQ is better but some drive firmware
isn't too bright and there are probably still cases where we get bad
interactions in the kernel code that want tuning too.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [rfc][patch] queued spinlocks (i386)

2007-03-24 Thread Ingo Molnar

* Nikita Danilov <[EMAIL PROTECTED]> wrote:

> Indeed, this technique is very well known. E.g., 
> http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
> section (3. Local-spin Algorithms) on them, citing papers from the 
> 1990 onward.

that is a cool reference! So i'd suggest to do (redo?) the patch based 
on those concepts and that terminology and not use 'queued spinlocks' 
that are commonly associated with MS's stuff. And as a result the 
contended case would be optimized some more via local-spin algorithms. 
(which is not a key thing for us, but which would be nice to have 
nevertheless)

Ingo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mm snapshot broken-out-2007-03-24-00-14.tar.gz uploaded

2007-03-24 Thread Michal Piotrowski

On 24/03/07, [EMAIL PROTECTED] <[EMAIL PROTECTED]> wrote:

The mm snapshot broken-out-2007-03-24-00-14.tar.gz has been uploaded to

   
ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/mm/broken-out-2007-03-24-00-14.tar.gz



My network doesn't work
"RTNETLINK answers: Invalid argument"

git-net* patches?

Regards,
Michal

--
Michal K. K. Piotrowski
LTG - Linux Testers Group (PL)
(http://www.stardust.webpages.pl/ltg/)
LTG - Linux Testers Group (EN)
(http://www.stardust.webpages.pl/linux_testers_group_en/)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: PATCH: tun/tap driver hw address handling

2007-03-24 Thread Ahmed S. Darwish
Hi Brian,

On Sat, Mar 24, 2007 at 01:56:50AM -0700, Brian Braunstein wrote:
> 
> Linus,
> 
>  According to Documentation/SubmittingPatches "bug fixes" or "obvious" 
> changes
>  should CCed to you, so this is why I have done this.
>

IMHO these days patches get reviewed on LKML, then tested enough in the
unstable -mm tree, and then they get added to the mainline kernel. Subsystem
maintainers can also add patches directly to mainline if they're trivial enough.
 
> Note: This entire email can be found at
> http://bristyle.com/share/patch-tuntap-hw_addr_handling.txt
> 

I think there is no need for those two lines. Everyone uses his favourite LKML archive.

>  Summary:
>Fix tun/tap driver's handling of hw addresses.  Specifically, ensure 
> that
>when the tun.dev_addr field is set, the net_device.dev_addr field gets
>set to the same value.
> 
>  Background:
>The device hw address is stored in 2 places, in the tun.dev_addr field,
>and of course the net_device struct's dev_addr field.  It really 
> seems to
>me that the tun.dev_addr field is redundant, and that anywhere it is 
> used

Editor/mailer wrapping your lines badly ?

> 
> --- linux-2.6.20.4-ORIG/drivers/net/tun.c   2007-03-23 
> 12:52:51.0 -0700
> +++ linux-2.6.20.4/drivers/net/tun.c2007-03-24 01:36:59.0 -0700
> @@ -18,6 +18,11 @@

Please reread SubmittingPatches to know the canonical patch format (missing
Signed-off-by and others).

> /*
>  *  Changes:
>  *
> + *  Brian Braunstein <[EMAIL PROTECTED]> 2007/03/23
> + *Fixed hw address handling.  Now net_device.dev_addr is kept 
> consistent
[Remaining patch]

The patch can't be applied or even read because your mailer has mistakenly
wrapped its lines.

Regards,

-- 
Ahmed S. Darwish
http://darwish.07.googlepages.com

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Thomas Meyer
Adrian Bunk schrieb:
> Subject: second suspend to disk in a row results in an oops  (libata?)
> References : http://lkml.org/lkml/2007/3/17/43
> Submitter  : Thomas Meyer <[EMAIL PROTECTED]>
> Status : unknown
>   

The problem is identified: http://lkml.org/lkml/2007/3/22/150



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [-mm patch] drivers/scsi/constants.c: make 2 functions static

2007-03-24 Thread Adrian Bunk
On Sat, Mar 24, 2007 at 12:11:05PM -0400, Douglas Gilbert wrote:
> Adrian Bunk wrote:
> > On Mon, Mar 19, 2007 at 08:56:23PM -0800, Andrew Morton wrote:
> >> ...
> >> Changes since 2.6.21-rc3-mm1:
> >> ...
> >>  git-scsi-misc.patch
> >> ...
> >>  git trees
> >> ...
> > 
> > 
> > This patch makes two needlessly global functions static.
> > 
> > Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
> > 
> > ---
> > 
> >  drivers/scsi/constants.c |4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > --- linux-2.6.21-rc4-mm1/drivers/scsi/constants.c.old   2007-03-23 
> > 23:26:39.0 +0100
> > +++ linux-2.6.21-rc4-mm1/drivers/scsi/constants.c   2007-03-23 
> > 23:26:55.0 +0100
> > @@ -1235,7 +1235,7 @@
> >  }
> >  EXPORT_SYMBOL(scsi_print_sense_hdr);
> >  
> > -void
> > +static void
> >  scsi_decode_sense_buffer(const unsigned char *sense_buffer, int sense_len,
> >struct scsi_sense_hdr *sshdr)
> >  {
> > @@ -1258,7 +1258,7 @@
> > }
> >  }
> >  
> > -void
> > +static void
> >  scsi_decode_sense_extras(const unsigned char *sense_buffer, int sense_len,
> >  struct scsi_sense_hdr *sshdr)
> >  {
> 
> Adrian,
> Who put those functions in?



[SCSI] constants.c: cleanup, verbose result printing

From: Martin K. Petersen

Clean up constants.c and make result printing more user friendly:

 - Refactor the command and sense functions so that the actual
   formatting can be called from the various helper functions with the
   correct prefix.

 - Replace scsi_print_hostbyte() and scsi_print_driverbyte() with
   scsi_print_result() which is verbose when CONFIG_SCSI_CONSTANTS is
   on.

Signed-off-by: Martin K. Petersen <[EMAIL PROTECTED]>
Signed-off-by: James Bottomley <[EMAIL PROTECTED]>



> The names and arguments look very similar to these
> exported functions in scsi_error.c *** :
>   scsi_normalize_sense
>   scsi_sense_desc_find
>   scsi_get_sense_info_fld
> 
> that I can see in 2.6.21-rc4
> 
> The proposed scsi_decode_sense_buffer() looks broken because
> it can fail and should return an int reflecting that.
> How scsi_decode_sense_extras() works is intriguing, unless
> struct scsi_sense_hdr has been changed as well.
> 
> 
> *** Putting sense decode logic in scsi_error.c is wrong
> because:
>   - the ATA command set is proposing an ATA REQUEST SENSE
> command to yield a sense buffer
>   - sense buffers don't necessarily indicate errors.
> 
> So moving those functions out of scsi_error.c IMO is a
> good idea. Breaking them in the move isn't.
> 
> Doug Gilbert
> 

cu
Adrian

-- 

   "Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
   "Only a promise," Lao Er said.
   Pearl S. Buck - Dragon Seed

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Why is NCQ enabled by default by libata? (2.6.20)

2007-03-24 Thread Justin Piszcz
Without NCQ, performance is MUCH better on almost every operation, with 
the exception of 2-3 items.


/usr/sbin/bonnie++ -d /x/bonnie -s 7952 -m p34 -n 16:10:16:64 > run.txt;

# Average of 3 runs with NCQ on for Quad Raptor ADFD 150 RAID 5 Software RAID:
p34-ncq-on,7952M,43916.3,96.6667,151943,28.6667,75794.3,18.6667,48991.3,99,181687,24,558.033,0.33,16:10:16/64,867.667,9,29972.7,98.,2801.67,16,890.667,9.3,27743,94.,2115.33,15.6667
# Average of 3 runs with NCQ off for Quad Raptor ADFD 150 RAID 5 Software RAID:
p34-ncq-off,7952M,42470,97.,200409,36.,90240.3,22.6667,48656,99,198853,27,546.467,0,16:10:16/64,972.333,10,21833,72.,3697,21,995,10.6667,27901.7,95.6667,2681,20.6667

http://home.comcast.net/~jpiszcz/ncq_vs_noncq/results.html

In general, for networking, etc, the kernel chooses 'optimized' defaults, 
therefore, I was curious why is NCQ enabled by default?


Justin.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: troubles with DAC960

2007-03-24 Thread Randy Dunlap
On Sat, 24 Mar 2007 15:21:16 +0100 Boris Andratzek wrote:

> Boris Andratzek wrote:
> > Hello members of the kernel-list,
> > 
> > 
> > I'm new to this and hope I don't misuse the list in any way.
> > 
> > Doing the update from debian sarge to etch on my server I ran into the
> > bug documented here: bugzilla.kernel.org/show_bug.cgi?id=7177
> > I already made some comments there. I tried to contact the maintainer of
> > the DAC960 driver but as far as I found out, Leonard N. Zubkoff
> > unfortunately died in a helicopter crash.
> > 
> > Without being impious, I want to ask if anybody can give me an idea of
> > how this thing will go on. Is there anybody new doing the maintaining?
> > Is there hope for a fix? Where can I find more information? Can I do
> > anything to help (testing, logging)?
> > 
> > regards,
> > 
> > Boris Andratzek
> > 
> 
> Hello!?
> 
> I didn't catch no reaction at all, so I am not sure if I sent my
> question the right way! Could anybody respond, please, even without an
> idea of solving my problem!?
> 
> Thanks,

There is no current DAC960 maintainer.

I'd try asking on the linux-scsi mailing list (cc-ed there).

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ckrm-tech] [PATCH 7/7] containers (V7): Container interface to nsproxy subsystem

2007-03-24 Thread Srivatsa Vaddagiri
On Sat, Mar 24, 2007 at 10:35:37AM +0530, Srivatsa Vaddagiri wrote:
> > +static int ns_create(struct container_subsys *ss, struct container *cont)
> > +{
> > +   struct nscont *ns;
> > +
> > +   if (!capable(CAP_SYS_ADMIN))
> > +   return -EPERM;
> 
> Does this check break existing namespace semantics in a subtle way?
> It now requires that unshare() of namespaces by any task requires
> CAP_SYS_ADMIN capabilities.

I should clarify that I am referring to unshare thr' clone here (and not
thr' sys_unshare)

> clone(.., CLONE_NEWUTS, ..)->copy_namespaces()->ns_container_clone()->
>   ->container_clone()-> .. -> container_create() -> ns_create()
> 
> Earlier, one could unshare his uts namespace w/o CAP_SYS_ADMIN
> capabilities. Now it is required. Is that fine? Don't know.
> 
> I feel we can avoid this check totally and let the directory permissions
> take care of these checks.
> 
> Serge, what do you think?

-- 
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [4/5] 2.6.21-rc4: known regressions (v2)

2007-03-24 Thread Michal Piotrowski

On 24/03/07, Thomas Gleixner <[EMAIL PROTECTED]> wrote:

On Sat, 2007-03-24 at 14:59 +0100, Michal Piotrowski wrote:
> On 23/03/07, Thomas Gleixner <[EMAIL PROTECTED]> wrote:
> > On Fri, 2007-03-23 at 19:50 +0100, Adrian Bunk wrote:
> > > Subject: soft lockup detected on CPU#0
> > > References : http://lkml.org/lkml/2007/3/3/152
> > > Submitter  : Michal Piotrowski <[EMAIL PROTECTED]>
> > > Handled-By : Thomas Gleixner <[EMAIL PROTECTED]>
> > >  Ingo Molnar <[EMAIL PROTECTED]>
> > > Status : unknown
> >
> > Michal,
> >
> > any news on that one ?
> >
> > You said the same problem exists in 2.6.20.1. Has this been resolved in
> > 2.6.20.2/3
>
> Yes, I tried 2.6.20.4 and it works fine.

Is it solved in Linus latest too ?


Yes, it's solved.

Adrian, please remove this bug from known regressions list.
It's fixed in the latest -git and -stable.

Regards,
Michal

--
Michal K. K. Piotrowski
LTG - Linux Testers Group (PL)
(http://www.stardust.webpages.pl/ltg/)
LTG - Linux Testers Group (EN)
(http://www.stardust.webpages.pl/linux_testers_group_en/)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: error in compilation kernel 2.6.19 - internal compiler error

2007-03-24 Thread Paolo Ornati
On Sat, 24 Mar 2007 16:45:46 +0100
Thibaud Hulin <[EMAIL PROTECTED]> wrote:

> I'm trying to compile the kernel 2.6.19 on Debian Testing 4.0
> Unfortunately, I can't succeed.
> This is my error message :
> 
> CC [M]  drivers/scsi/lpfc/lpfc_sli.o
> In file included from drivers/scsi/lpfc/lpfc_sli.c:23:
> include/linux/pci.h:251: internal compiler error: in build_int_cst_wide, 
> at tree.c:803
> Please submit a full bug report,
> with preprocessed source if appropriate.
> See <URL:http://gcc.gnu.org/bugs.html> for instructions.
> For Debian GNU/Linux specific bug reporting instructions,
> see .
> The bug is not reproducible, so it is likely a hardware or OS problem.


Have you read the error message?

It could be a memory problem (defective module)... try running memtest86 or
memtest86+ (http://www.memtest.org/) for many hours (>= 8, some people
suggest even 24h, just to be sure :).

If the problem is big memtest should find it soon.

-- 
Paolo Ornati
Linux 2.6.20.4 on x86_64
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [-mm patch] drivers/scsi/constants.c: make 2 functions static

2007-03-24 Thread Douglas Gilbert
Adrian Bunk wrote:
> On Mon, Mar 19, 2007 at 08:56:23PM -0800, Andrew Morton wrote:
>> ...
>> Changes since 2.6.21-rc3-mm1:
>> ...
>>  git-scsi-misc.patch
>> ...
>>  git trees
>> ...
> 
> 
> This patch makes two needlessly global functions static.
> 
> Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
> 
> ---
> 
>  drivers/scsi/constants.c |4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> --- linux-2.6.21-rc4-mm1/drivers/scsi/constants.c.old 2007-03-23 
> 23:26:39.0 +0100
> +++ linux-2.6.21-rc4-mm1/drivers/scsi/constants.c 2007-03-23 
> 23:26:55.0 +0100
> @@ -1235,7 +1235,7 @@
>  }
>  EXPORT_SYMBOL(scsi_print_sense_hdr);
>  
> -void
> +static void
>  scsi_decode_sense_buffer(const unsigned char *sense_buffer, int sense_len,
>  struct scsi_sense_hdr *sshdr)
>  {
> @@ -1258,7 +1258,7 @@
>   }
>  }
>  
> -void
> +static void
>  scsi_decode_sense_extras(const unsigned char *sense_buffer, int sense_len,
>struct scsi_sense_hdr *sshdr)
>  {

Adrian,
Who put those functions in?

The names and arguments look very similar to these
exported functions in scsi_error.c *** :
  scsi_normalize_sense
  scsi_sense_desc_find
  scsi_get_sense_info_fld

that I can see in 2.6.21-rc4

The proposed scsi_decode_sense_buffer() looks broken because
it can fail and should return an int reflecting that.
How scsi_decode_sense_extras() works is intriguing, unless
struct scsi_sense_hdr has been changed as well.


*** Putting sense decode logic in scsi_error.c is wrong
because:
  - the ATA command set is proposing an ATA REQUEST SENSE
command to yield a sense buffer
  - sense buffers don't necessarily indicate errors.

So moving those functions out of scsi_error.c IMO is a
good idea. Breaking them in the move isn't.

Doug Gilbert


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH UPDATED][2] cosmetic adaption of drivers/ide/Kconfig concerning SATA

2007-03-24 Thread Patrick Ringl

Bartlomiej Zolnierkiewicz wrote:

On Sunday 18 March 2007, Patrick Ringl wrote:
  

Bartlomiej Zolnierkiewicz wrote:

Hello,


On Sunday 18 March 2007, Patrick Ringl wrote:
  

Hello,


Hi,

  
since especially Serial ATA has its own menu point now, I guess we can 
change the description of the deprecated SATA driver as well, since the 
new libATA subsystem is not configured through a SCSI low-level driver 
anymore, but has its own menu point.


The following patch is against 2.6.21-rc4:

--- linux-2.6.20.old/drivers/ide/Kconfig2007-03-18 00:05:11.0 
+0100
+++ linux-2.6.20/drivers/ide/Kconfig2007-03-18 00:09:47.0 +0100
@@ -103,7 +103,7 @@
---help---
  There are two drivers for Serial ATA controllers.

- The main driver, "libata", exists inside the SCSI subsystem
+ The main driver, "libata", exists inside the ATA subsystem


Strictly speaking libata is not a separate subsystem (it still uses SCSI
subsystem) and "ATA subsystem" may be misleading, since we now have:

* "ATA/ATAPI/MFM/RLL support" menu for drivers/ide

* "Serial ATA (prod) and Parallel ATA (experimental) drivers" menu for libata

What about replacing "exists inside" into "uses" and adding info about
the new menu instead?
  
Well, that's even a better idea :-) I wasn't that sure about what to do 
.. it's just that it could be misleading since the new (s/p)ata drivers 
are not living in the scsi low-level subsystem anymore, but got their 
own menu point.


Here's a different patch >:)



applied

[ The patch was whitespace damaged and didn't apply at all
  so I had to manually change the Kconfig to merge your change. ]
  
D'Oh .. Probably due to the fact that I copied it and did not attach it 
in plain/text.


Thanks


Thanks,
Bart

  


regards,
Patrick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] [USB] [PL2303]: fixed to skip NULL entry in pl2303_shutdown.

2007-03-24 Thread Noriaki TAKAMIYA
Hi,

  While booting, this entry is set to NULL in destroy_serial(),
  but serial->port is referred again in pl2303_shutdown() via
  serial->type->shutdown.
---
 drivers/usb/serial/pl2303.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 83dfae9..d631f8c 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -905,6 +905,8 @@ static void pl2303_shutdown(struct usb_s
dbg("%s", __FUNCTION__);
 
for (i = 0; i < serial->num_ports; ++i) {
+   if (!serial->port[i])
+   continue;
priv = usb_get_serial_port_data(serial->port[i]);
if (priv) {
pl2303_buf_free(priv->buf);
-- 
1.4.4

--
Noriaki TAKAMIYA
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: (usagi-core 32633) [PATCH 1/2] [USB] [PL2303]: fixed to skip NULL entry in pl2303_shutdown.

2007-03-24 Thread Noriaki TAKAMIYA
Sorry for resending.

  While booting, this entry is set to NULL in destroy_serial(),
  but serial->port is referred again in pl2303_shutdown() via
  serial->type->shutdown.

Signed-off-by: Noriaki TAKAMIYA <[EMAIL PROTECTED]>

---
 drivers/usb/serial/pl2303.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 83dfae9..d631f8c 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -905,6 +905,8 @@ static void pl2303_shutdown(struct usb_s
dbg("%s", __FUNCTION__);
 
for (i = 0; i < serial->num_ports; ++i) {
+   if (!serial->port[i])
+   continue;
priv = usb_get_serial_port_data(serial->port[i]);
if (priv) {
pl2303_buf_free(priv->buf);
-- 
1.4.4

--
Noriaki TAKAMIYA


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] [USB] [SERIAL]: fixed to skip NULL port entry in struct usb_serial_port.

2007-03-24 Thread Noriaki TAKAMIYA
Hi,

  This patch makes usb_get_serial_port_data() return NULL when it is passed
  a NULL port (e.g. a serial->port[i] entry that has been set to NULL).
---
 include/linux/usb/serial.h |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 32acbae..85ed5ef 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -99,7 +99,10 @@ struct usb_serial_port {
 /* get and set the port private data pointer helper functions */
 static inline void *usb_get_serial_port_data (struct usb_serial_port *port)
 {
-   return dev_get_drvdata(&port->dev);
+   if (port)
+   return dev_get_drvdata(&port->dev);
+   else
+   return NULL;
 }
 
 static inline void usb_set_serial_port_data (struct usb_serial_port *port, 
void *data)
-- 
1.4.4

--
Noriaki TAKAMIYA
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: (usagi-core 32633) [PATCH 1/2] [USB] [PL2303]: fixed to skip NULL entry in pl2303_shutdown.

2007-03-24 Thread Noriaki TAKAMIYA
Sorry for resending

Hi,

  While booting, this entry is set to NULL in destroy_serial(),
  but serial->port is referred again in pl2303_shutdown() via
  serial->type->shutdown.
---
 drivers/usb/serial/pl2303.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 83dfae9..d631f8c 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -905,6 +905,8 @@ static void pl2303_shutdown(struct usb_s
dbg("%s", __FUNCTION__);
 
for (i = 0; i < serial->num_ports; ++i) {
+   if (!serial->port[i])
+   continue;
priv = usb_get_serial_port_data(serial->port[i]);
if (priv) {
pl2303_buf_free(priv->buf);
-- 
1.4.4

--
Noriaki TAKAMIYA
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   >