Re: 2.6.24.2-rt2

2008-02-26 Thread Steven Rostedt


On Tue, 26 Feb 2008, Jan Kiszka wrote:

 Jan Kiszka wrote:
  At this chance: We still see the same unbalanced sched-other load on our
  NUMA box as Gernot once reported [1]:
 
  top - 11:19:20 up 4 min,  1 user,  load average: 29.52, 9.54, 3.37
  Tasks: 502 total,  41 running, 461 sleeping,   0 stopped,   0 zombie
  Cpu0  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu1  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu2  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu3  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu4  :  0.0%us,  0.3%sy,  0.0%ni, 99.7%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu5  :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu6  :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu7  :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu8  :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu9  :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu10 :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu11 :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu12 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu13 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu14 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Cpu15 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  
  0.0%st
  Mem:  65513284k total,  1032032k used, 64481252k free, 6444k buffers
  Swap:  3204896k total,0k used,  3204896k free,37312k cached
 

 ETOOMANYKERNELS, this was from 2.6.23.12-rt14. 2.6.24.2-rt2 shows a
 different pattern under identical load:

There has been CFS updates, which may account for the differences. Seems
better though.


 top - 12:55:27 up 2 min,  1 user,  load average: 9.97, 2.42, 0.81
 Tasks: 491 total,  42 running, 449 sleeping,   0 stopped,   0 zombie
 Cpu0  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu1  : 99.7%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
 Cpu2  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu3  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu4  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu5  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu6  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu7  :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu8  :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu9  :  0.3%us,  0.3%sy,  0.0%ni, 99.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu10 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu11 :  0.0%us,  0.0%sy,  0.0%ni,100.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu12 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu13 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu14 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Cpu15 :100.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
 Mem:  65512480k total,   580704k used, 64931776k free, 8964k buffers
 Swap:  3204896k total,0k used,  3204896k free,   129720k cached


What's the NUMA topology? What tasks are running, and at what priorities?

Those three idle CPUS, should they have tasks running on them?

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24.2-rt2

2008-02-26 Thread Steven Rostedt


On Tue, 26 Feb 2008, Jan Kiszka wrote:
 
 
  What's the NUMA topology?

 4 nodes. I'm not sure if it is really NUMA related, but the same kernel
 runs that test as expected on a non-NUMA 2x2 box.

  What tasks are running, and at what priorities?

 40 pthreads, created with default parameters from a main thread which
 runs with default parameters as well. The threads simply run endless loops.

 
  Those three idle CPUS, should they have tasks running on them?

 For sure, given the overload situation of the system (40x full load vs.
 16 cores). Neither did we fiddle with any parameter of the system
 (knowingly, it's a standard openSUSE 10.3 underneath) nor did we set
 thread affinities.


Do you get different behaviour with 2.6.24.2?

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH [RT] 05/14] rearrange rt_spin_lock sleep

2008-02-22 Thread Steven Rostedt

On Fri, 22 Feb 2008, Gregory Haskins wrote:

 Gregory Haskins wrote:
  @@ -732,14 +741,15 @@ rt_spin_lock_slowlock(struct rt_mutex *lock)
 
  debug_rt_mutex_print_deadlock(waiter);
 
  -   schedule_rt_mutex(lock);
  +   update_current(TASK_UNINTERRUPTIBLE, saved_state);

 I have a question for everyone out there about this particular part of
 the code. Patch 6/14 adds an optimization that is predicated on the
 order in which we modify the state==TASK_UNINTERRUPTIBLE vs reading the
 waiter.task below.

 My assumption is that the xchg() (inside update_current()) acts as an
 effective wmb().  If xchg() does not have this property, then this code
 is broken and patch 6/14 should also add a:


 +   smp_wmb();

I believe that the wmb would be needed. I doubt that xchg on all archs
would force any ordering of reads and writes. It only needs to guarantee the
atomic nature of the data exchange. I don't see any reason that it would
imply any type of memory barrier.

-- Steve




  +   if (waiter.task)
  +   schedule_rt_mutex(lock);
  +   else
  +   update_current(TASK_RUNNING_MUTEX, saved_state);
 
  spin_lock_irqsave(lock->wait_lock, flags);
  current->flags |= saved_flags;
  current->lock_depth = saved_lock_depth;
  -   state = xchg(current->state, TASK_UNINTERRUPTIBLE);
  -   if (unlikely(state == TASK_RUNNING))
  -   saved_state = TASK_RUNNING;


 Does anyone know the answer to this?

 Regards,
 -Greg

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.24.2-rt2

2008-02-21 Thread Steven Rostedt
We are pleased to announce the 2.6.24.2-rt2 tree, which can be
downloaded from the location:

  http://rt.et.redhat.com/download/

Information on the RT patch can be found at:

  http://rt.wiki.kernel.org/index.php/Main_Page

Changes since 2.6.24-rt1

  - ported to 2.6.24.2

  - *** New ftrace utility ***
  The old latency_tracer has now been replaced with the cleaned up
  version that is being prepared for mainline.

  - compiler warning fix (Shi Weihua)


to build a 2.6.24.2-rt2 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.24.tar.bz2
  http://kernel.org/pub/linux/kernel/v2.6/patch-2.6.24.2.bz2
  http://rt.et.redhat.com/download/patch-2.6.24.2-rt2.bz2


And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt3


The broken out patches are also available.



-- Steve



-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2.6.24-rt1] timer:fix build warning in timer.c

2008-02-14 Thread Steven Rostedt


On Thu, 14 Feb 2008, Shi Weihua wrote:

 Fix the following compile warning without CONFIG_PREEMPT_RT:
 kernel/timer.c:937: warning: ‘count_active_rt_tasks’ defined but not used

 Signed-off-by: Shi Weihua [EMAIL PROTECTED]

Thanks, applied.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] add task migration_disable critical section

2008-02-12 Thread Steven Rostedt

On Tue, 12 Feb 2008, Gregory Haskins wrote:

 This patch adds a new critical-section primitive pair:

 migration_disable() and migration_enable()

This is similar to what Mathieu once posted:

http://lkml.org/lkml/2007/7/11/13

Not sure the arguments against (no time to read the thread again). But I'd
recommend that you read it.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Use the RT Latency Trace? - LET ME KNOW!

2008-01-25 Thread Steven Rostedt

On Fri, 25 Jan 2008, Kevin Hilman wrote:
  I'm about to gut the RT latency tracer with the version I'm pushing
  upstream. This will be some of the changes:

 Might I request that you follow the mainline stabilization model and
 wait until the 2.6.25-rcX-rtY series to make a significant change like
 this?

 I have nothing against your changes, but am not crazy about seeing
 something like this happen in the stable part of the dev cycle.


??? - I'm unaware of a stable rt kernel. But I will grant you. I planned
on (and announced this) that 2.6.24-rt1 will not include the change. But
I'm hoping to use -rt as a place to show that the code is stable enough
for mainline, and that is why I'm pushing so hard for this now. I want
some of the code to be in 2.6.25-rc1.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.24-rt1

2008-01-25 Thread Steven Rostedt
We are pleased to announce the 2.6.24-rt1 tree, which can be
downloaded from the location:

  http://rt.et.redhat.com/download/

Information on the RT patch can be found at:

  http://rt.wiki.kernel.org/index.php/Main_Page

Changes since 2.6.24-rc8-rt1

  - ported to 2.6.24


to build a 2.6.24-rt1 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.24.tar.bz2
  http://rt.et.redhat.com/download/patch-2.6.24-rt1.bz2


And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt3


The broken out patches are also available.


*** NOTICE ***

This still has the old version of the latency tracer. I'll try to
release a -rt2 soon that has the new version. This way we can see what
kind of regressions the new version might give.

-- Steve



-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Use the RT Latency Trace? - LET ME KNOW!

2008-01-24 Thread Steven Rostedt

Hi all,

I'm about to gut the RT latency tracer with the version I'm pushing
upstream. This will be some of the changes:

Move from /proc to /debugfs

instead of all the files in proc (for which I don't even know what half of
them do) will be condensed into a few files in /debugfs/tracing

  available_tracers
   This file holds the available traces that you can use. Note, unlike
   -rt, you can choose between tracing preemptoff, irqsoff or preempt
   and irqs off latency at runtime. Here's the available tracers:

events - traces kernel events (like EVENT_TRACE)

wakeup - trace wakeup timings

preemptirqsoff - trace preempt and irqs off latencies

preemptoff - trace only preempt off latencies

irqsoff - trace irqs off latencies

function - trace functions (mcount)

sched_switch - a context switch tracer (watch processes get
scheduled in and out)

   Only one trace may be enabled at a time. Most of these traces also have
   a function equivalent, which will be activated if function trace is
   compiled in and sysctl kernel.mcount_enabled=1.

  current_tracer
   This holds the current active trace (from above). To change or activate
   a trace, simply echo in one of the above traces into this file.

e.g.  echo irqsoff  /debugfs/tracing/current_tracer

  iter_ctrl
   This file controls the output of traces. So far there's only two
   different commands:

verbose (noverbose) - change to latency_tracer verbose output.

symonly (nosymonly) - only print kallsyms addr, and leave out the
hex IP addr.

  latency_trace
   This is where one can read the latency trace from.
(the old /proc/latency_trace)

  trace
   This is another format for output. A little easier to read but not as
   much information.

  trace_ctrl
   echo 1  starts the tracer, echo 0  stops (for all traces)

  tracing_max_latency
   This is just like the old preempt_max_latency for those that use it
   (preempt and irqs off as well as wakeup)

  tracing_thresh
   This is just like the old preempt_thresh.

Now the user has full access to start and stop the trace from these
files.

Note: I've been pushing a lot of this mainline. Some of this will not be
pushed mainline. Specifically, the event trace. Since the event trace does
a lot that LTTng does. I will probably wait till the markers of LTTng go
in. Or perhaps I'll simply push event trace, but that's still TBA.

I'll also hack up the prctl(0,0) - trace off - and prctl(0,1) trace on for
-rt only. This will not be something that will be pushed upstream.

If there's people on this list that use tools for latency tracing, let me
know and I can help you in the conversion. Also note some other
differences.

 - There will be some slightly different output formats. One is that the
trace_special which seems to always pass in an eip will now convert that
eip into a comm (kallsyms).

- The hrtimer_trace will use a hex value for the timer pointer instead of
a decimal.

- context switching is done a bit different:

e.g.
  bash-3138  1d..3 342us+: 3138:120:S -- 3131:120

This is a sched switch from bash 3138 of prio 120 in sleep state to task
sshd 3131 (I know this because of the next line down that I didn't show
;-) of prio 120.

But here's just a heads up on the new tracer. If there's issues, I'll try
to work them out.

Expect this new tracing format to be in either 2.6.24-rc8-rt2 or
2.6.24-rt2. (if 2.6.24 comes out, I may just port to it without adding the
tracer).

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-21 Thread Steven Rostedt

On Mon, 21 Jan 2008, Esben Nielsen wrote:

 Please, tell what in the license forbids me to make a global replacement
 EXPORT_SYMBOL_GPL -> EXPORT_SYMBOL and distribute the result?

If you want to distribute that code, the authors of that said code
may be able to challenge you in saying that you are enabling a means to
circumvent a way around the license, and hold you liable. Remember, all it
takes is one country with the laws that will grant this complaint.


 For me, on the other hand, it is against the spirit of free software to
 actively make a block for people to do what ever they want with the code
 when they are only doing it to themselves. That includes loading non-GPL
 software into the kernel. The only thing they  are not allowed to do is to
 distribute it and in that way hurt other people.

Honestly, I don't care which export it is. The thing is that I derived
that code from someone else. I did not look up the original author of the
code to find out which export they would like it to be. I may be able to
argue that since it was under a LGPL and not a GPL license, I may very
well be able to export it that way.

I'm taking the safe way out. By exporting it as EXPORT_SYMBOL_GPL, I am
safe either way. By exporting it as EXPORT_SYMBOL without first hearing
from the original author (and getting that in writing), or hearing it from
a lawyer, I may be putting myself at risk.

Feel free to create a version of this code and
s/EXPORT_SYMBOL_GPL/EXPORT_SYMBOL/ and distribute it. I won't come after
you for that, but at least I know those that would, will go after you and
not me.

Call me a chicken, I don't care, but I'm just not going to put myself nor
my company I work for, at risk over this issue.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Would like to run better stress test.

2008-01-18 Thread Steven Rostedt

On Thu, 17 Jan 2008, Mark Knecht wrote:

 I found source code for hackbench but no instructions for someone at
 my level to build it. I tried pi_stress but it hung my machine.

$ gcc -o hackbench hackbench.c
$ while :; do ./hackbench 50; done

That's basically all I do.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why not creating a GIT RT tree ?

2008-01-18 Thread Steven Rostedt

On Fri, 18 Jan 2008, James Cloos wrote:

  git://git.kernel.org/pub/scm/linux/kernel/git/cloos/rt-2.6.git
 http://www.kernel.org/pub/scm/linux/kernel/git/cloos/rt-2.6.git

Thanks for doing this. We might want to add this to the wiki.

   http://rt.wiki.kernel.org/index.php/Main_Page

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-17 Thread Steven Rostedt

On Thu, 17 Jan 2008, Mariusz Kozlowski wrote:

 Hello,

   I found this in dmesg:
  
   BUG: swapper:0 task might have lost a preemption check!
   Pid: 0, comm: swapper Not tainted 2.6.24-rc7-rt2 #3
[c010386b] show_trace_log_lvl+0x1d/0x3b
[c01042f3] show_trace+0x12/0x14
[c0104a2f] dump_stack+0x6a/0x70
[c0115419] preempt_enable_no_resched+0x5c/0x5e
 
  This is really really strange. cpu_idle calls __preempt_enable_no_resched
  and not preempt_enable_no_resched (notice the prefixed underscores).
  So I don't know how you got that output. Did you get any strange rejects
  in applying this patch?

 Nope. Your rt patch applied cleanly to vanilla 2.6.24-rc7.


OK, do you still get this message? Also I'm assuming this is x86, right?

Could you also do the following.

Go into your kernel build directory.
Start up gdb (I'm hoping that you compiled with DEBUG_INFO)
  gdb vmlinux
(gdb) li *0xc0100e35

and show me what you get.

Thanks,

-- Steve


[c0100e35] cpu_idle+0x6d/0x82
[c0323b6e] rest_init+0x66/0x68
[c043aba6] start_kernel+0x20c/0x276
[] 0x0
===
   ---
   | preempt count:  ]
   | 0-level deep critical section nesting:
   
  
   Box runs fine though.

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc8-rt1

2008-01-17 Thread Steven Rostedt

On Thu, 17 Jan 2008, Daniel Walker wrote:


 On Thu, 2008-01-17 at 19:17 +0100, Wolfgang Grandegger wrote:
  [0.733248] TCP bind hash table entries: 2048 (order: 3, 57344
  bytes)
  [0.741132] TCP: Hash tables configured (established 2048 bind
  2048)
  [0.747981] TCP reno registered
  [0.805896] krcupreemptd setsched 0
  [0.809657]   prio = 98

 That's interesting .. You should try running cyclictest at priority 99
 to eliminate other threads that might get involved (using -p99 instead
 of -p80 , I think) ..

No that prio is the internal prio where smaller number is higher priority.
The krcupreemptd runs at RT prio 1, which is 98 internally.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc8-rt1

2008-01-17 Thread Steven Rostedt

On Thu, 17 Jan 2008, Daniel Walker wrote:


 On Thu, 2008-01-17 at 19:17 +0100, Wolfgang Grandegger wrote:
  [0.733248] TCP bind hash table entries: 2048 (order: 3, 57344
  bytes)
  [0.741132] TCP: Hash tables configured (established 2048 bind
  2048)
  [0.747981] TCP reno registered
  [0.805896] krcupreemptd setsched 0
  [0.809657]   prio = 98

 That's interesting .. You should try running cyclictest at priority 99
 to eliminate other threads that might get involved (using -p99 instead
 of -p80 , I think) ..


But, anotherthing to try is disabling CONFIG_PREEMPT_RCU_BOOST, and see if
that fixes anything.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-16 Thread Steven Rostedt

   On Tue, 15 Jan 2008 02:37:37 +0200, S.Çağlar Onur 
   said:
And because of mcount-add-basic-support-for-gcc-profiler-instrum.patch, 
closed
source nvidia-new module cannot be used with this release (mcount is 
exported
GPL only), i know this is not supported but i used it with that [2] 
patch up
until now without a single problem.
  
   Playing devil's advocate here - the claim is that EXPORT_SYMBOL_GPL is to
   indicate that code is getting too chummy with Linux internals.
  
   However, in *this* case, isn't it code that is too chummy with *GCC* 
   internals,
   and thus it isn't our place to say what can and can't be done with code 
   that
   is derivative of the GCC compiler? ;)
 
  Actually, it got put in there by accident. I usually default all my
  exports as GPL.  But this breaks pretty much everything, so I'll leave it
  as EXPORT_SYMBOL.

 OK, I can live with that. ;)


We modified mcount now, and it is derived from an objdump of glibc. So
this is most definitely a derived work from glibc. But glibc is licensed
as LGPL, which IIRC allows for non GPL to link to it.

I personally could care less if we use EXPORT_SYMBOL or EXPORT_SYMBOL_GPL.
But I really want to do The Right Thing(tm). I'm not a lawyer and don't
claim that I know anything about the law, but I'm leaning towards the non
_GPL version because the code was from LGPL and not from strict GPL.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-16 Thread Steven Rostedt

On Wed, 16 Jan 2008, Steven Rostedt wrote:

 We modified mcount now, and it is derived from an objdump of glibc. So
 this is most definitely a derived work from glibc. But glibc is licensed
 as LGPL, which IIRC allows for non GPL to link to it.

 I personally could care less if we use EXPORT_SYMBOL or EXPORT_SYMBOL_GPL.
 But I really want to do The Right Thing(tm). I'm not a lawyer and don't
 claim that I know anything about the law, but I'm leaning towards the non
 _GPL version because the code was from LGPL and not from strict GPL.

Sorry folks, I'm going to stick with the _GPL version. It doesn't mean
that you can't still load your nVidia module into -rt. It just means you
can't turn on function trace and then load it. Well, you might if you
don't compile the nVidia wrapper against it with function trace on.

The reason simply is to cover my butt.  By limiting it to GPL, I'm fine.
Even if the original author didn't care. But by opening it up to external
prorietary modules, I may be considered infringing on the license.

So, unless I hear from a lawyer that is willing to back me up on a non
_GPL export publically, the mcount function will stay as an
EXPORT_SYMBOL_GPL.

Note: There is a definite reason for this change. The previous version
of mcount was written by Ingo Molnar, and he added the export. I've
changed mcount to be closer to the glibc code (which I derived it from),
so the change in EXPORT type is legitimate.

-- Steve
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt1

2008-01-16 Thread Steven Rostedt

On Mon, 14 Jan 2008, Robert Schwebel wrote:

 On Sun, Jan 13, 2008 at 03:48:33PM -0500, Steven Rostedt wrote:
  Yeah, I didn't make any attempt to fix latency_tracing for PPC on this
  release. I expected as much. I do have a couple of PPC boxes that I
  can play with, and I'll see if I can get that working too before -rt2.
  But I want to get a stable x86 release out fast.

 It compiles now, although I still don't get any kernel output any more
 if I enable CONFIG_FUNCTION_TRACE. Sorry for not being more verbose, I'm
 out of office right now and my arms are not long enough to connect the
 BDI ;)

Hmm, I wonder if the bootup is calling mcount before mcount is ready. This
could be the issue.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [BUG] Oops while running Specjbb on -rt kernel

2008-01-16 Thread Steven Rostedt

Peter,

Could you take a look at this?

-- Steve


On Mon, 14 Jan 2008, Chirag Jog wrote:

 Hi,
 On Running Specjbb on the latest kernel [2.6.24-rc7-rt1], I
 get the following oops.

 This is reproducible easily. I have seen it on the
 2.6.24-rc5-rt1.
 Although I didn't see it on 2.6.23.11-rt14.
 Also I haven't seen it on 2.6.24-rc6 [Mainline].


 Unable to handle kernel NULL pointer dereference at 0060 RIP:
  [8022d353] pick_next_task_fair+0x2d/0x3f
 PGD 236ce9067 PUD 23851d067 PMD 237cf4067 PTE 0
 Oops:  [1] PREEMPT SMP
 CPU 2
 Modules linked in: ipv6 autofs4 i2c_dev i2c_core hidp rfcomm l2cap bluetooth 
 sunrpc dm_mirror dm_multipath dm_mod sbs sbshc dock battery ac power_supply 
 parport_pc lp parport sg bnx2 button serio_raw shpchp k8temp rtc_cmos 
 rtc_core pcspkr hwmon rtc_lib mptsas mptscsih mptbase scsi_transport_sas 
 sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci_hcd
 Pid: 3905, comm: java Not tainted 2.6.24-rc7-rt1 #1
 RIP: 0010:[8022d353]  [8022d353] 
 pick_next_task_fair+0x2d/0x3f
 RSP: 0018:8101d2c7bc48  EFLAGS: 00210046
 RAX:  RBX:  RCX: 8101d2c794e0
 RDX: 0004 RSI: 0001 RDI: 81000e8a98e0
 RBP: 8101d2c7bc58 R08: 81000e8a5a40 R09: 0002
 R10: 810236c7d1a0 R11: 0001 R12: 81000e8a5a40
 R13: 0001 R14: 81000e8a9880 R15: 0001000481a6
 FS:  2ac153330090() GS:81023fc057c0(0063) knlGS:97effb90
 CS:  0010 DS: 002b ES: 002b CR0: 8005003b
 CR2: 0060 CR3: 00023981d000 CR4: 06e0
 DR0:  DR1:  DR2: 
 DR3:  DR6: 0ff0 DR7: 0400
 Process java (pid: 3905, threadinfo 8101d2c7a000, task 8101d2c794e0)
 Stack:  8022ced1 8101d2c794e0 8101d2c7bcf8 80486345
  8101d2c7bcc0  8101d2c7be08 81023f526040
  8101d2c794e0 80257b15 8101d2c79760 000200402040
 Call Trace:
  [8022ced1] put_prev_task_rt+0xd/0x18
  [80486345] __schedule+0x3d6/0x6fc
  [80257b15] __rt_mutex_adjust_prio+0x11/0x24
  [80258285] task_blocks_on_rt_mutex+0x103/0x1bf
  [8048697d] schedule+0xdf/0xff
  [804874d8] rt_mutex_slowlock+0x1c3/0x29d
  [80487175] rt_mutex_lock+0x28/0x2a
  [8025850f] __rt_down_read+0x47/0x4b
  [80258529] rt_down_read+0xb/0xd
  [80256c8f] do_futex+0x753/0xb1d
  [8020a76e] __switch_to+0x11a/0x2a0
  [80232b8b] finish_task_switch+0x2b/0x98
  [8025758c] compat_sys_futex+0xd8/0xf6
  [8020f573] syscall_trace_enter+0x95/0x99
  [80225534] cstar_do_call+0x1b/0x65


 Code: 48 8b 7b 60 48 85 ff 75 e0 48 8d 43 b8 41 58 5b c9 c3 55 48
 RIP  [8022d353] pick_next_task_fair+0x2d/0x3f
  RSP 8101d2c7bc48
 CR2: 0060


 Here is the config file.


[snipped - see linux-rt-users]


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-16 Thread Steven Rostedt

On Tue, 15 Jan 2008, Mariusz Kozlowski wrote:
 Ok. It works.

 I found this in dmesg:

 BUG: swapper:0 task might have lost a preemption check!
 Pid: 0, comm: swapper Not tainted 2.6.24-rc7-rt2 #3
  [c010386b] show_trace_log_lvl+0x1d/0x3b
  [c01042f3] show_trace+0x12/0x14
  [c0104a2f] dump_stack+0x6a/0x70
  [c0115419] preempt_enable_no_resched+0x5c/0x5e

This is really really strange. cpu_idle calls __preempt_enable_no_resched
and not preempt_enable_no_resched (notice the prefixed underscores).
So I don't know how you got that output. Did you get any strange rejects
in applying this patch?

-- Steve


  [c0100e35] cpu_idle+0x6d/0x82
  [c0323b6e] rest_init+0x66/0x68
  [c043aba6] start_kernel+0x20c/0x276
  [] 0x0
  ===
 ---
 | preempt count:  ]
 | 0-level deep critical section nesting:
 

 Box runs fine though.

 Regards,

   Mariusz
 -
 To unsubscribe from this list: send the line unsubscribe linux-rt-users in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-16 Thread Steven Rostedt

Thomas, can you look at this. He's getting APIC errors on bootup. I'm
wondering if this isn't another strange anomaly of this controller.

He also states that he doesn't get this with the non-rt kernel.


 -rt3 on top of 2.6.24-rc8 works fine without that sysfs problem (acpi warnings
 still there and full dmesg can be found from [1]), whatever causes this seems
 solved :)

 [1] http://cekirdek.pardus.org.tr/~caglar/dmesg.rt3

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2 [PATCH] latency tracer fix for ppc32

2008-01-16 Thread Steven Rostedt


On Wed, 16 Jan 2008, Luotao Fu wrote:

 I found out that the tracer got stuck on ppc32 platforms because some early
 functions call _mcount before mcount_enabled is initialized at all. I made a
 patch, which marks these functions as notrace to solve this problem. With this
 patch I can successfully boot up our mpc5200b platform and make latency trace.
 (tested with -b switch in cyclictest). Please comment.

 I made my patch against the -rt2 tree since the dummy call early_printk() in
 -rt3 conflicts with our implementation of a functional early_printk(). It
 should also work with -rt3 though.


Thanks, applied.

But for future reference, if you attach your patch please name it with the
ending of .patch and not .diff. Also do it at a -p1 level and not -p0.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.24-rc8-rt1

2008-01-16 Thread Steven Rostedt
We are pleased to announce the 2.6.24-rc8-rt1 tree, which can be
downloaded from the location:

  http://rt.et.redhat.com/download/

Information on the RT patch can be found at:

  http://rt.wiki.kernel.org/index.php/Main_Page

Changes since 2.6.24-rc7-rt3

  - ported to 2.6.24-rc8

  - PPC bootup notrace added for function trace (Luotao Fu)

  - MIPS remove duplicate Kconfig (Frank Rowand)

to build a 2.6.24-rc8-rt1 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.tar.bz2
  http://www.kernel.org/pub/linux/kernel/v2.6/testing/patch-2.6.24-rc8.bz2
  http://rt.et.redhat.com/download/patch-2.6.24-rc8-rt1.bz2


And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt3


The broken out patches are also available.

-- Steve



-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-15 Thread Steven Rostedt


On Tue, 15 Jan 2008, Luotao Fu wrote:

 Compiling with the tracing function turned on still seemed to be broken to me:

Could you email me your config (privately)

 kernel/built-in.o: In function `early_printk_name':
 latency_trace.c:(.text+0x3cd18): undefined reference to `early_printk'
 latency_trace.c:(.text+0x3cd40): undefined reference to `early_printk'
 kernel/built-in.o: In function `early_print_entry':
 latency_trace.c:(.text+0x3cd80): undefined reference to `early_printk'
 latency_trace.c:(.text+0x3cdd8): undefined reference to `early_printk'
 latency_trace.c:(.text+0x3cdfc): undefined reference to `early_printk'

 We made a early_printk patch based on the 8250_early for powerpc 32bit here.
 (still absolutely untested and incomplete, we will post it asap). I added
 early_printk() and exported _mcount() and could compile successfully. However 
 the
 kernel will not boot. I attached a bdi to it and found out that it gets stuck at
 _mcount in arch/powerpc/kernel/entry_32.S at about line 1080, where the 
 variable
 mcount_enabled is loaded and checked. Obviously the variable is still not 
 valid
 at the time of check. To check this I took out the lines like above

 --- arch/powerpc/kernel/entry_32.S.orig 2008-01-15 17:01:25.0 +0100
 +++ arch/powerpc/kernel/entry_32.S  2008-01-15 17:17:18.0 +0100
 @@ -1075,9 +1075,6 @@
 stw r6, 24(r1)
 mflrr3  /* will use as first arg to __trace() */
 mfcrr4
 -   lis r5,[EMAIL PROTECTED]
 -   lwz r5,[EMAIL PROTECTED](r5)
 -   cmpwi   r5,0
 stw r3, 44(r1)  /* lr */
 stw r4,  8(r1)  /* cr */
 stw r7, 28(r1)

 After recompiling the kernel started normally and it seems to work. I was even
 able to make a trace with cyclictest. However there were several crashes 
 (which
 might be caused by other problems). I still have to take a closer look.

 I am just wondering why the check for mcount_enabled is there at all and I 
 think
 that there due to be some better fixes than just throw it out ;-). On the 
 other
 side, I just can't find in which way mcount_enabled is used in the tracer at
 all. Could you give me some hints on this one?


The way we turn on and off mcount function calls at run time is through
mcount_enabled.  I'll look into why this is broken for you. I'm able to run
with this. My box is a ppc64, and my ppc32 (powerbook) won't come close to
booting RT (never did).  On boot up it locks up right away and the screen
looks like it starts to melt. No serial, so I don't have much to debug
that with.

Anyway, I'll take a deeper look into this.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-15 Thread Steven Rostedt

On Tue, 15 Jan 2008 [EMAIL PROTECTED] wrote:

 On Tue, 15 Jan 2008 02:37:37 +0200, =?utf-8?q?S=2E=C3=87a=C4=9Flar?= Onur 
 said:
  And because of mcount-add-basic-support-for-gcc-profiler-instrum.patch, 
  closed
  source nvidia-new module cannot be used with this release (mcount is 
  exported
  GPL only), i know this is not supported but i used it with that [2] patch up
  until now without a single problem.

 Playing devil's advocate here - the claim is that EXPORT_SYMBOL_GPL is to
 indicate that code is getting too chummy with Linux internals.

 However, in *this* case, isn't it code that is too chummy with *GCC* 
 internals,
 and thus it isn't our place to say what can and can't be done with code that
 is derivative of the GCC compiler? ;)

Actually, it got put in there by accident. I usually default all my
exports as GPL.  But this breaks pretty much everything, so I'll leave it
as EXPORT_SYMBOL.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.24-rc7-rt2

2008-01-14 Thread Steven Rostedt
We are pleased to announce the 2.6.24-rc7-rt2 tree, which can be
downloaded from the location:

  http://rt.et.redhat.com/download/

Information on the RT patch can be found at:

  http://rt.wiki.kernel.org/index.php/Main_Page

Changes since 2.6.24-rc7-rt1

  - Several merge fixes reported by:
Mariusz Kozloski

  - Removal of kvm-rt.patch (it's so old it is now bogus)

  - PPC compile fix (reported by: Robert Schwebel)

  - Remove of running softirq by hardirq (too dangerous)

  - Changed BUG_ON in filemap from atomic to pagefault disabled
   (Steven Rostedt)

Note: There are still some fixes that are not incorporated here yet.


to build a 2.6.24-rc7-rt2 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.tar.bz2
  http://www.kernel.org/pub/linux/kernel/v2.6/testing/patch-2.6.24-rc7.bz2
  http://rt.et.redhat.com/download/patch-2.6.24-rc7-rt2.bz2


And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt3


The broken out patches are also available.

-- Steve



-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt1

2008-01-14 Thread Steven Rostedt

On Mon, 14 Jan 2008, Robert Schwebel wrote:


 It compiles now, although I still don't get any kernel output any more
 if I enable CONFIG_FUNCTION_TRACE. Sorry for not being more verbose, I'm
 out of office right now and my arms are not long enough to connect the
 BDI ;)


I have a strange anomaly going on with my box. It seems to get stuck, but
if I keep hitting the keyboard (serial console), it continues. Seems that
a timer is stuck.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt2

2008-01-14 Thread Steven Rostedt

On Mon, 14 Jan 2008, Mariusz Kozlowski wrote:

 Hello,

  We are pleased to announce the 2.6.24-rc7-rt2 tree, which can be
  downloaded from the location:
 
http://rt.et.redhat.com/download/

 Compiles fine but I run into another problem. On startup kernel oopses
 early and this 'oops' loops over and over again on the screen until you
 shut down the box. So I can't (easily) capture the output. I tried different
 boot_delay values but it seems that it doesn't work as intended. Seems that
 it delays only the first printk from oops, then the rest loops over the screen
 without delay - again it becomes unreadable. I used my usual config with a 
 bunch
 of debug options enabled so that maybe is not real rt system case. Also tried 
 to
 bisect it with quilt but that is hard. Lots of patches depend on other 
 patches
 so bisection breaks things. I'll look into this tomorrow and reply if I get
 anything substantial.

 I attached my config in case anybody wants to try to reproduce that.


Turn off LOCKDEP and see if you still have the same problems. This might
be fixed with Daniel's patch.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt1: macro put_cpu_var passed 2 arguments, but takes just 1

2008-01-13 Thread Steven Rostedt


On Sun, 13 Jan 2008, Mariusz Kozlowski wrote:

 [... snip ...]

 @@ -56,12 +64,13 @@ static inline void __quicklist_free(int
 struct page *page)
  {
 struct quicklist *q;
 +   int cpu;

 -   q = get_cpu_var(quicklist)[nr];
 +   q = get_cpu_var_locked(quicklist, cpu)[nr];
 *(void **)p = q-page;
 q-page = p;
 q-nr_pages++;
 -   put_cpu_var(quicklist);
 +   put_cpu_var(quicklist, cpu); - should that be 
 put_cpu_var_locked()?
  }


Ouch! looks at rejs

*** static inline void __quicklist_free(int
*** 76,86 
return;
}

-   q = get_cpu_var(quicklist)[nr];
*(void **)p = q-page;
q-page = p;
q-nr_pages++;
-   put_cpu_var(quicklist);
  }

  static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
--- 73,83 
return;
}

+   q = get_cpu_var_locked(quicklist, cpu)[nr];
*(void **)p = q-page;
q-page = p;
q-nr_pages++;
+   put_cpu_var_locked(quicklist, cpu);
  }

  static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)


Darn, that was my fault. OK, will release a -rt2 soon.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.24-rc7-rt1

2008-01-13 Thread Steven Rostedt

On Sun, 13 Jan 2008, Robert Schwebel wrote:

 On Sun, Jan 13, 2008 at 02:00:01PM -0500, Steven Rostedt wrote:
  We are pleased to announce the 2.6.24-rc7-rt1 tree

 Works fine on phyCORE-MPC5200B-tiny, here are the latest results:
 http://www.pengutronix.de/oselas/realtime/results/20080113-1/

That doesn't look too bad. Or is over 100us not good for that box?

Also is it SMP?



 However, when I try to switch on the latency tracer, it ends with the
 warnings below. I've added early_printk() stubs in order to make it
 link at all.

Yeah, I didn't make any attempt to fix latency_tracing for PPC on this
release. I expected as much. I do have a couple of PPC boxes that I can
play with, and I'll see if I can get that working too before -rt2. But I
want to get a stable x86 release out fast.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Question: RT patch and CFS patch ?

2008-01-04 Thread Steven Rostedt

On Wed, 2 Jan 2008, Andrzej Walczak wrote:

 Dear All,

 I have question:

 Will RT kernel have any benefits from improved CFS scheduler ?

 or maybe this question in other words

 Does it make sense to patch kernel with CFS patch
 (http://people.redhat.com/mingo/cfs-scheduler/sched-cfs-v2.6.23.12-v24.1.patch)
 or other relevant  before/after patching with RT patch ?


2.6.24-rc-rt should have the latest upstream CFS stuff plus some other
goodies.  2.6.23-rt is still the older CFS scheduler.  I'm currently
trying to get Ingo's backport to run on .23-rt. But I won't release that
with CFS until I get it stable.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: loadavg vs loadavgrt

2007-12-20 Thread Steven Rostedt

On Thu, 20 Dec 2007, Jaswinder Singh wrote:

 Hello,

 By linux-2.6.23-rt3 on ideal condition, I am getting :-

 [EMAIL PROTECTED]:~# cat /proc/loadavgrt
 1.00 1.00 1.00 0/52 1158

This is bogus. It's a known issue, and there's a patch in the works.
The problem is that the calculation of load average is done via a softirq,
which in vanilla Linux is not a thread (nor an RT task). But in RT, all
softirqs are RT tasks, and when this calculation is performed, it sees
that there is always an RT task running. That RT task that it sees
running happens to be the softirq RT task that is calculating the load!

So the numbers are bogus!

-- Steve


 [EMAIL PROTECTED]:~# cat /proc/loadavg
 0.00 0.00 0.02 1/52 1159
 [EMAIL PROTECTED]:~#

 I am curious:
 1. Why loadavgrt is not reflecting load on loadavg

 2. loadavgrt is giving correct information or it is bogus

 Thank you,

 Jaswinder Singh.

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] sched: dynamically update the root-domain span/online maps

2007-12-18 Thread Steven Rostedt

Ingo Molnar wrote:

well since i reverted the original patch, there's no regression. The 
question is, do we know whether this new patch works fine wrt. s2ram?




Hi Ingo,

I included the same patches into 2.6.23.9-rt13 and someone reported s2r 
failed for them. I've included Greg's updates into a pre release of 
-rt14 and sent that to the reporter. I'm waiting on a response before 
releasing -rt14.


Although I did just get a response from Andrew Morton saying that the 
updated patch fixed his box.


-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.24-rc5-rt1

2007-12-14 Thread Steven Rostedt
We are pleased to announce the 2.6.24-rc5-rt1 tree, which can be
downloaded from the usual location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.24-rc2-rt1

  - Ported to 2.6.24-rc5

  - Backported the new RT Balancing code from sched-devel
   New changes by Steven Rostedt, Gregory Haskins,
Ingo Molnar, and Dmitry Adamushko

  - 2 dimension CPU Prio RT balancing search (Gregory Haskins)

  - ARM compile fix (Kevin Hilman)

  - Handle IRQ_PENDING for simple irq thread (Steven Rostedt)

  - latency tracer updates (Daniel Walker)

  - Remove warning in local_bh_enable (Kevin Hilman)

  - use real time pcp locking for page draining during cpu (Andi Kleen)

  - Revert lazy disable irq from simple irq handler (Steven Rostedt)

  - AT91 switch to edge from simple irq (Remy Bohmer)

  - POWER hacks to let kernbench run (Paul McKenney)

  - RCU Preempt extra variable fix (Remy Bohmer)

to build a 2.6.24-rc5-rt1 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.tar.bz2
  http://www.kernel.org/pub/linux/kernel/v2.6/testing/patch-2.6.24-rc5.bz2
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.24-rc5-rt1.gz


And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt2


The broken out patches are also available.

-- Steve



-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/6] RCU: Preemptible-RCU

2007-12-13 Thread Steven Rostedt
On Thu, 13 Dec 2007, Gautham R Shenoy wrote:


 Currently it is based against the latest linux-2.6-sched-devel.git

 Awaiting your feedback!

Hi Gautham,

Thanks for posting this. I believe this is the same version of preempt RCU
as we have in the RT patch. It seems to be very stable. I ran the RT patch
version of the RCU Preempt (just the Preempt RCU patches without RT on
latest git) on a 64way box and the results seems just as good (if not
slightly better) than classic RCU!  I'll rerun this patch series on that
box and post the results.

From what I'm seeing with this, it is ready for mainline. These
patches should probably go into -mm and be ready for 2.6.25.  If Andrew
wants to wait for my results, I'll run them tonight.

Thanks Gautham, Paul and Dipankar for all this great work!

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23.9-rt13

2007-12-13 Thread Steven Rostedt

Hi Remy,

This would have been better to send this as two separate
patches. They are two different issues.

On Thu, 13 Dec 2007, Remy Bohmer wrote:
 If I compile -rt13 I get some compile warnings on ARM (AT91):
 1) This one did not exist in rt1:
 In file included from kernel/sched.c:911:
 kernel/sched_rt.c: In function 'dec_rt_tasks':
 kernel/sched_rt.c:88: warning: unused variable 'highest_prio'

This is from Gregory Haskins' patch. He forgot to compile check for
warnings on UP again ;-)

 2)  This one is there already for a much longer time:
 CC  kernel/rcupreempt.o
 kernel/rcupreempt.c:1001: warning: 'per_cpu__rcu_dyntick_snapshot'
 defined but not used

This was a merge conflict being solved incorrectly.


 Both warnings are fixed by the attached patch, but warning 2 needs some 
 review.
 This var is defined twice in this file, 1 in the NO_HZ ifdef, and 1
 which seems to be not used.

Yeah, I'll look into it since I wrote that usage ;-)

Greg,

Can you merge the first part into your patch and resend it to me.

I'll look at the second one.

Thanks,

-- Steve
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RT] Revert Softdisable for simple irqs.

2007-12-12 Thread Steven Rostedt

In commit 76d2160147f43f982dfe881404cfde9fd0a9da21 lazy irq disabling
was implemented, and the simple irq handler had a masking set to it.

Remy Bohmer discovered that some devices in the ARM architecture
would trigger the mask, but never unmask it. His patch to do the
unmasking was questioned by Russell King about masking simple irqs
to begin with. Looking further, it was discovered that the problems
Remy was seeing was due to improper use of the simple handler by
devices, and he later submitted patches to fix those. But the issue
that was uncovered was that the simple handler should never mask.

This patch reverts the masking in the simple handler.

[Note: This version is for the RT patch, and the IRQ_PENDING is needed
 for threaded IRQs]

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

---
 kernel/irq/chip.c |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: linux-2.6.23.9-rt13/kernel/irq/chip.c
===
--- linux-2.6.23.9-rt13.orig/kernel/irq/chip.c
+++ linux-2.6.23.9-rt13/kernel/irq/chip.c
@@ -302,10 +302,9 @@ handle_simple_irq(unsigned int irq, stru
action = desc-action;
if (unlikely(!action || (desc-status  (IRQ_INPROGRESS |
 IRQ_DISABLED {
-   if (desc-chip-mask)
-   desc-chip-mask(irq);
desc-status = ~(IRQ_REPLAY | IRQ_WAITING);
-   desc-status |= IRQ_PENDING;
+   if (action  (desc-status  IRQ_INPROGRESS))
+   desc-status |= IRQ_PENDING;
goto out_unlock;
}

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] handle IRQ_PENDING for simple irq handler

2007-12-12 Thread Steven Rostedt
Note, that should have been [PATCH RT], this is NOT for mainline!

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] handle IRQ_PENDING for simple irq handler

2007-12-12 Thread Steven Rostedt

With the IO-APIC pcix hack (level=edge masking), we can receive
interrupts while masked. But these interrupts might be missed.

Also, normal simple interrupts might be missed too on leaving of
thread handler.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

Index: linux-2.6.21-rt-hack/kernel/irq/manage.c
===
--- linux-2.6.21-rt-hack.orig/kernel/irq/manage.c
+++ linux-2.6.21-rt-hack/kernel/irq/manage.c
@@ -628,14 +628,17 @@ static void thread_simple_irq(irq_desc_t
unsigned int irq = desc - irq_desc;
irqreturn_t action_ret;

-   if (action  !desc-depth) {
+   do {
+   if (!action || desc-depth)
+   break;
+   desc-status = ~IRQ_PENDING;
spin_unlock(desc-lock);
action_ret = handle_IRQ_event(irq, action);
cond_resched_hardirq_context();
spin_lock_irq(desc-lock);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
-   }
+   } while (desc-status  IRQ_PENDING);
desc-status = ~IRQ_INPROGRESS;
 }



-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RT] Revert Softdisable for simple irqs.

2007-12-12 Thread Steven Rostedt

On Wed, 12 Dec 2007, Remy Bohmer wrote:
 Also:
 Acked-by: Remy Bohmer [EMAIL PROTECTED]

 Thanks for the effort also, I still had it on my todo list, but that
 is not needed anymore...

No problem. Could you also ACK the one I sent for mainline.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RT] add pending in fasteoi for IOAPIC hack

2007-12-12 Thread Steven Rostedt

The IOAPIC hack that does a level=edge to mask does not disable
interrupts. So we can receive interrupts when masked, and this means
that we can miss interrupts that arrive when the thread is handling
them.

This patch adds the IRQ_PENDING logic of the edge irqs to be
able to catch interrupts coming in when supposably masked.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

Index: linux-2.6.21-rt-hack/kernel/irq/chip.c
===
--- linux-2.6.21-rt-hack.orig/kernel/irq/chip.c
+++ linux-2.6.21-rt-hack/kernel/irq/chip.c
@@ -398,8 +398,10 @@ handle_fasteoi_irq(unsigned int irq, str

spin_lock(desc-lock);

-   if (unlikely(desc-status  IRQ_INPROGRESS))
+   if (unlikely(desc-status  IRQ_INPROGRESS)) {
+   desc-status |= IRQ_PENDING;
goto out;
+   }

desc-status = ~(IRQ_REPLAY | IRQ_WAITING);
kstat_cpu(cpu).irqs[irq]++;


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH]: Atmel Serial Console interrupt handler splitup

2007-12-12 Thread Steven Rostedt

[ keeping full email for Ingo and Thomas ]

On Fri, 7 Dec 2007, Remy Bohmer wrote:

 Hello Andrew,

 Attached I have put 3 patches for the AT91-series for the serial port
 driver on AT91.
 The interrupt handler of the serial console is quite long, and if
 Preempt-RT is used on AT91 the interrupt handler is even doing illegal
 things. These illegal things are related to the DBGU port of which the
 interrupt handler is shared with the system_interrupt which
 automagically runs in IRQF_NODELAY context due to the timer interrupt.
 The old interrupt handler calls some kernel interfaces that can block
 on a mutex.

 To solve this, I have split up the interrupt handler in a IRQF_NODELAY
 safe/unsafe part, with as result that the code running in real
 interrupt context is shortened a lot (the tty driver part is now
 called from a tasklet). On AT91 David Brownell noticed several months
 ago that the DBGU can miss some characters on NO_HZ. I would expect
 that this would be better now due to the shorter interrupt handler,
 although it was not my goal to solve it with these patches. (David can
 you verify if this is better now?)

 So, I have here 3 patches:
 * atmel_serial_cleanup - This patch adapts the driver to the coding
 rules, splits the interrupt handler into 3 routines (cuts the routine
 in more readable pieces) , but there is no functional change involved.
 * atmel_serial_irq_splitup - This patch splits up the interrupt handler.
 * atmel_serial_irqf_nodelay - This patch is additionally required to
 get it properly working on Preempt-RT. (This patch should thus go into
 the RT-patch, AFTER integration of the other 2 patches into mainline)

 BUT: I based the patch on the 2.6.23.1 + your patch collection at
 http://maxim.org.za/AT91RM9200/2.6/2.6.23-at91.patch.gz

 I did this because this driver conflicts with the DMA(PDC) patches
 that are in the patchset on  maxim.org.za. I found out that these DMA
 patches are still not integrated into mainline, although it is in your
 patchset for several kernel releases. I can make a series based on
 kernel mainline, but that would harden the integration for you in your
 patchset.
 The patchset itself is not dependent on the DMA changes, so I can
 split it up, but the DMA changes itself are quite heavy.

 But there is also a relation with Preempt-RT. To get the patch in
 preempt RT the other patches has to be in mainline, so things are
 stacking up now.

Not really.


 What is wise here? should I create a new patchset for mainline? Or can
 you push the DMA patch also to mainline together with this set? I have
 it working here for months, so I see no reason not to, maybe you have
 a reason?

 I tested it on AT91rm9200-EK (+proprietary boards) + AT91SAM9261-EK,
 in combination with: 2.6.23.1 and 2.6.23.1-rt5 up to serial speed to
 115200 (also with 99% CPU load on prio 99 on RT, no missing characters
 detected.)

 Note: Preempt-RT CANNOT run without these patches on AT91.

I could pull all the patches into RT (although I would like Thomas to take
a look first and give an OK). And then apply your patches on top. I'm
assuming that this only affects the ARM architecture and the AT91 device?

I'm leaving this out for -rt13 and for the next cut of 2.6.24-rc-rt. But
if Thomas is OK with pulling in the external patch queue, I'll do it for
-rt14.

It is best if the patch queue in question makes it into mainline.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23.9-rt13

2007-12-12 Thread Steven Rostedt
We are pleased to announce the 2.6.23.9-rt13 tree, which can be
downloaded from the location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.23.9-rt12

  - Backported the new RT Balancing code from sched-devel
   New changes by Steven Rostedt, Gregory Haskins,
Ingo Molnar, and Dmitry Adamushko

  - 2 dimension CPU Prio RT balancing search (Gregory Haskins)

  - ARM compile fix (Kevin Hilman)

  - Disable HPET legacy replacement for kdump (OGAWA Hirofumi)

  - disable HPET on shutdown (OGAWA Hirofumi)

  - fix for futex_wait signal stack corruption (Steven Rostedt)

  - Handle IRQ_PENDING for simple irq thread (Steven Rostedt)

  - latency tracer updates (Daniel Walker)

  - Remove warning in local_bh_enable (Kevin Hilman)

  - use real time pcp locking for page draining during cpu (Andi Kleen)

  - Revert lazy disable irq from simple irq handler (Steven Rostedt)

  - AT91 switch to edge from simple irq (Remy Bohmer)


to build a 2.6.23.9-rt13 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.9.tar.bz2 
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.9-rt13.bz2

And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt2


The broken out patches are also available.

-- Steve


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH Latency Tracer] don't panic on failed bootmem alloc

2007-12-05 Thread Steven Rostedt
Ingo,

This patch prevents a panic on a failed bootmem alloc in the
initialization of the tracer buffers.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

Index: linux-2.6-latency/kernel/latency_trace.c
===
--- linux-2.6-latency.orig/kernel/latency_trace.c
+++ linux-2.6-latency/kernel/latency_trace.c
@@ -2720,10 +2720,11 @@ void * __init tracer_alloc_bootmem(unsig
 {
void * ret;
 
-   ret =__alloc_bootmem(size, SMP_CACHE_BYTES, ARCH_LOW_ADDRESS_LIMIT);
+   ret =__alloc_bootmem_nopanic(size, SMP_CACHE_BYTES,
+ARCH_LOW_ADDRESS_LIMIT);
if (ret != NULL  ((unsigned long)ret)  ARCH_LOW_ADDRESS_LIMIT) {
free_bootmem(__pa(ret), size);
-   ret = __alloc_bootmem(size,
+   ret = __alloc_bootmem_nopanic(size,
SMP_CACHE_BYTES,
__pa(MAX_DMA_ADDRESS));
}


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PREEMPT_RT]: On AT91 ARM: GPIO Interrupt handling can/will stall forever

2007-11-29 Thread Steven Rostedt

On Thu, 29 Nov 2007, Russell King - ARM Linux wrote:

 On Thu, Nov 29, 2007 at 12:27:30PM +0100, Remy Bohmer wrote:

 Ah, and looking at the changes to the file, the addition of the mask
 and unmask was done by someone who didn't understand what this was
 trying to do.  So that change should be backed out.


Perhaps only part of the change should be backed out.  The part that masks
the irq in the handle_simple_irq code.

That's from commit 76d2160147f43f982dfe881404cfde9fd0a9da21 which is to
not disable an irq line when disable_irq is called.  A form of lazy
disable irq.

This speeds up code that uses disable_irq, since the line is only masked
when an interrupt actually arrives. Using disable_irq / enable_irq does no
IRQ chip modifications if an interrupt from the IRQ line does not arrive
between the two.

Now the question is, can something that uses handle_simple_irq call
disable_irq?  If there is no mask function, I would assume that this would
be a noop in such a case. If this is true, then we could remove the mask
from handle_simple_irq. But then we might want to add a BUG() in
disable_irq for simple_irqs.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PREEMPT_RT]: On AT91 ARM: GPIO Interrupt handling can/will stall forever

2007-11-29 Thread Steven Rostedt

On Thu, 29 Nov 2007, Remy Bohmer wrote:

 I changed the interrupt handler from the simple_irq to the edge_irq,
 and it works...!!
 (I added a noop routine for that .ack part, because there is no ack)

 I believe I was too focussed on the masking bug in the RT kernel on
 the simple_irq() that I did not see that for the AT91 series the edge
 type interrupt handler also works... (even better...) What I thought
 was 1 single bug in the RT-kernel turned out to be a number of things
 together that aren't correct, even for mainline.

 So, to come to a conclusion: The masking bug in RT is still there in
 the simple_irq path, and masking has to be removed from the simple_irq
 code. Also for mainline. AT91 can live without simple_irq.
 I think we are in sync again...

 I will post a patch for the AT91 later on, after some more testing.


Remy,

Thanks a lot for figuring this out!! Makes me feel better for the NACK ;-)

If you also want to send me a patch to remove the masking in the
simple_irq code, please do.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23.9-rt12

2007-11-28 Thread Steven Rostedt
We are pleased to announce the 2.6.23.9-rt12 tree, which can be
downloaded from the location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.23.1-rt11

  - Ported to 2.6.23.9

  - Backported the RT balance code from our submissions to upstream

  - Now missing Gregory's CPU Prio patch (those that are interested
in testing that, should contact Gregory (email above)
and come back with numbers to convince us that this
is worth it).

to build a 2.6.23.9-rt12 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.9.tar.bz2 
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.9-rt12.bz2

And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt2

  Note: the -rt2.  Michal Schmidt helped point out and fix the fact
that ketchup -s 2.6-rt and ketchup 2.6-rt were broken.
I never used that feature, but now I will.

The broken out patches are also available.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PREEMPT_RT]: On AT91 ARM: GPIO Interrupt handling can/will stall forever

2007-11-28 Thread Steven Rostedt


On Wed, 28 Nov 2007, Russell King - ARM Linux wrote:
 On Wed, Nov 28, 2007 at 03:38:11PM +0100, Remy Bohmer wrote:
  Hello Daniel,
 
*  Note: The caller is expected to handle the ack, clear, mask and
*  unmask issues if necessary.
   So we shouldn't need any flow control unless there is some other
   factors..
 
  This comment can be misinterpreted, I think. Who is assumed to be the
  caller in this context? The 2 other routines in the driver that
  actually do the unmasking stuff besides only calling this routine? Is
  it allowed to call it directly or should it always be done through a
  wrapper that does all these special things?

 The whole point of this simple handler is to accommodate interrupts such
 as those found on the Neponset board.

 There, you have a status register in a CPLD but no enable/disable
 registers.  The status register tells you whether the SA, ethernet
 or 'USAR' chip asserted its interrupt.

 However, as there is no way to disable the sources, this situation has
 to be handled carefully - the function decoding the interrupt source
 needs to mask and unmask the _parent_ interrupt itself, and it's
 exactly that which the comment is directed towards.

 See neponset_irq_handler().

 The simple IRQ handler is not meant for anything other than that really
 simple implementation.  If people have been using it with interrupts
 which can be individually masked and unmasked, that's where the bug is.
 They should be using one of the other handlers.


Russell,

Thanks for the reply and this nice explanation.

I'm taking this as a NACK.

Daniel or Remy, could you find the offending users and send patches
to fix them?

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PREEMPT_RT]: On AT91 ARM: GPIO Interrupt handling can/will stall forever

2007-11-28 Thread Steven Rostedt

On Wed, 28 Nov 2007, Russell King - ARM Linux wrote:

 On Wed, Nov 28, 2007 at 02:04:41PM -0500, Steven Rostedt wrote:
  Thanks for the reply and this nice explanation.
 
  I'm taking this as a NACK.
 
  Daniel or Remy, could you find the offending users and make send patches
  to fix them.

 Note that I'm not acking nor nacking the patch; I'm not involved with
 the RT stuff and I've never looked at the code, so I don't know what
 the implications of the patch itself are.

Understood. But I didn't know to pull it in or not. So I used your
explanation to NACK it myself. I don't understand all the intricacies of
the arm architecture. So while Thomas is out, I'm not pulling this in.

If he comes back and gives his ACK, I'll simply NACK my NACK ;-)


 I've merely explained the point of the simple irq handler.

 Maybe the simple irq handler should never have been something that got
 sucked into the generic IRQ stuff, but kept as something specific to
 Neponset.


This could also simply be unique to the interrupt threads (only in RT). So
perhaps the patch is OK.

Remy, sorry about this round-a-bout. But I don't have any of the hardware
that this affects, and I'm just being cautious.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PREEMPT_RT]: On AT91 ARM: GPIO Interrupt handling can/will stall forever

2007-11-28 Thread Steven Rostedt


On Wed, 28 Nov 2007, Daniel Walker wrote:


 Ignoring the ARM side of things for a sec, handle_simple_irq() will
 mask() the interrupt in the special case that an interrupt is already in
 the processes of being handled.. handle_simple_irq() also unmasks when
 it finishes handling an interrupt (something real time adds for some
 reason) ..

 In terms of threading the irq everything is the same except there is no
 unmask() call when the thread finishes ..


OK, to be honest, I never fully understood the concept of this
simple_irq. I figured it was because of the ARM architecture.

Your arguments seem reasonable and you are probably correct. But I didn't
write this code, nor do I understand it, and before I go ahead and change
it, I'll wait to hear input from Thomas. Hopefully, he'll be back soon.

Perhaps my confusion about the simple_irq part is from the bug you are
trying to fix. I've been confused by why it was different ;-)

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: rt mutex priority boost

2007-11-28 Thread Steven Rostedt

On Wed, 28 Nov 2007, Peter W. Morreale wrote:

 Will do.  didn't want to flood your mailbox, if that was the case... :-)

No prob with my Inbox ;-)

 
  Well, make does do a lot of IO and syscalls. Accessing the hard drive.
  This in turn will kick off interrupts and softirqs. Which will all contend
  for spinlocks, and since they are all working together, expect a lot of
  contention.
 
  -- Steve
 

 It does, and that was the point.

 Switching gears here a little bit...

 The real problem I see is under a moderate 'dbench' load (No laughing,
 you want VFS contention, use dbench :-) I can easily bump the cs/s
 (context-switch/sec) rate to 380k/s.

Right, and this has nothing to do with priority boosting. It has to do
with lock contention.


 This on a ramfs (no disk involved) partition.  The bad part is that
 top(1) reports 50-60% idle CPU time.  Which implies that 2 of my 4
 x86_64 intels are spinning while there is work to do.

 As an early experiment, I converted the dcache, inode, and vfsmount
 spins to raw, and performance jumped by 4x.  (I realized later that
 dbench does alot of record locking and was still hammered by the BKL,
 otherwise I suspect it would have been significantly greater...)  This
 also reduced the cs/s rate to below 100k/s (from the high of ~380k/s)

After converting those locks to spinlocks, have you tried running
cyclictest and hackbench (or dbench) and see how cyclictest works?

I bet you'll see extremely large latencies.

Those locks are some of the biggest offenders of adding latencies.


 It seems clear that a single point of contention (e.g: the dcache lock
 in the above workload) greatly impacts the throughput of the hardware
 platform.  There are similar points of contention with dev->_xmit_lock,
 and queue_lock in the networking stack.

 Obviously, this is an issue for real-world apps.  Those pesky thingies
 think they need data from various sources to do stuff.  That was humor.

 At the risk of being chastised, is (or has) this any discussion on this
 been taking place?

Discussion of what?  Changing them to raw spinlocks?

The real solution is to find better ways to handle the filesystem with
less contention. This will take great knowledge of the VFS. But this is no
trivial task.

Ideas are welcome.

-- Steve


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Priorities of IRQ handlers

2007-11-27 Thread Steven Rostedt

On Mon, 26 Nov 2007, John Sigler wrote:

 I need to change the priorities of several soft and hard IRQ handlers.

 Namely,

 o reduce the prio of softirq-timer handler to 10

 o reduce the prio of IRQ14 and IRQ15 handlers to 20
(my flash drives do not support DMA BTW...)

 o boost the prio of my I/O boards' IRQ handlers to 60
(there can be 1 or 2 boards, the driver is a kernel module
 which is loaded after the system has booted.)

 I've written a short program that calls
sched_setscheduler(pid, SCHED_FIFO, param);
 with the appropriate pid, which I look up using ps -ef

Look for the program chrt. It does this for you. This program is
available in all major distributions of Linux.


 I need to automate the process of tweaking priorities.

 Can someone offer advice and / or pointers?

Perhaps look at one of the Start up scripts, and add the chrt command
there.

-- Steve
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: updated ketchup script

2007-11-26 Thread Steven Rostedt

On Thu, 22 Nov 2007, Michal Schmidt wrote:

 Steven,

 Your ketchup script does not download the -rt kernel for me. This patch
 fixes it. Could you update
 http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt1 ?

 I'm also attaching the complete ketchup script for convenience.


Michal,

Could you explain what exactly went wrong.  Your ketchup script
doesn't work for me. It fails downloads for me.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [BUG on PREEMPT_RT, 2.6.23.1-rt5] in rt-mutex code and signals

2007-11-19 Thread Steven Rostedt

On Mon, 19 Nov 2007, Remy Bohmer wrote:

 Hello Steven,

   OK, I wont be able to work on this this weekend, but I'll try to get to it
   on Monday.  A better example to show the bug you are looking for is simply
   create a mutex and create a thread that grabs that mutex and goes to
   sleep. Have your driver read grab that mutex with
   mutex_lock_interruptible. And if the signal code is broken with this, then
   you definitely got a point that the interruptible code is broken.

 I removed the 'struct semaphore' completely from my driver, using real
 mutexes now instead, replace the signalling semaphores by 'struct
 completion' mechanisms and got rid of the possible race I saw with the
 completions in a different way, and now the problem is completely
 gone!

 Posix Signals work properly now (no OOPS anymore), so the problem was
 likely related to the way I used the 'struct semaphore' types, which
 is thus different compared to the non-RT kernel and therefore quite
 confusing.

 So, thank you (and Daniel) for pointing me into the right direction.

 Now lets get rid of the 'struct semaphore' completely in the kernel :-))

Remy,

Thanks a lot for looking further into this. I'd like to join the fun in
removing the rest of the semaphores in the kernel, but with you, Ingo,
Daniel and Jon going to do that, one more cook will just spoil the stew.

Once we get rid of all the semaphores in the kernel that are being used as
mutexes, then we can change the code in the -rt patch to keep semaphores
default as compat_semaphores.  And any out of tree driver would just need
to be fixed to use mutexes.

Well, read/write semaphores might have to stay special.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [BUG on PREEMPT_RT, 2.6.23.1-rt5] in rt-mutex code and signals

2007-11-17 Thread Steven Rostedt


On Sat, 17 Nov 2007, Remy Bohmer wrote:

 Hello Steven,

  The taker of a mutex must also be the one that releases it.  I don't see
  how you could use a mutex for this. It really requires some kind of
  completion, or a compat_semaphore.

 I tried several ways of working around the bug, even tried
 implementing it with kernel threads and protecting global data with
 mutexes. Therefor I know that I have the same problem with mutexes. I
 just created a simple example that showed the problem quickly, this
 does not mean that this is the only case that does not work.

 BTW: I am hacking around in the PREEMPT-RT kernel for years now, I
 know the history very well, and I know what I am doing... Please, do
 not find holes in the example I quickly hacked together, please focus
 on the OOPS message, and help me figuring out what is causing this. I
 can give you other examples of code that shows the same problem. But,
 basically, every call that blocks with _interruptible() on an rt-mutex
 beneath the surface in the context of a user space process (and
 receives a signal during the block) shows the same problem.

Please don't think that I just took your example to find a hole in it.

Actually, the reason I pointed that out was not because of Oh, this
example is broken, but because of the description of the problem you
are trying to solve. To me, it needs to be done with a completion or
compat_semaphore otherwise it will become very complex in solving.


  Exactly why it should be a completion or compat semaphores. The reason we
  did PI on semaphores is only because they were used as mutexes before Ingo
  pushed to actually get a mutex primative into the kernel. Since then,
  we've been trying to remove all semaphores with either a mutex or
  completion.

 Okay, sounds fair. But: the current implementation does counting the
 number of up's and down's, suggesting that it really behaves like a
 semaphore. It only does some special things during the transition of
 the counter from 0->1 and from 1->0. If this counting is illegal use
 of the mutex mechanism, it should report (compile) errors if:
 * sema_init is used on 'struct semaphore' - init_MUTEX() must be used 
 instead.
 * sema_init should only be used on 'struct compat_semaphore' types.
 * calls to up() and down() in a row should report a BUG message,
 * if up() is called from a different thread than the down() it should
 report a BUG message. Further, the counting up's and down's are not
 allowed on struct semphore types, so it should be removed from the
 code.

It should print out warnings, do you have CONFIG_DEBUG_RT_MUTEXES set?

 * PI should only take place if it is for 100% sure that the 'struct
 semaphore' is used as a mutex. And this is only the case when it is
 initialised with init_MUTEX().

Well, we can't determine that with code ;-)  Remember, there are still
drivers out in the world that use semaphores as mutexes. So the PI
on semaphores is really more of a compatibility issue.


 So, because all these items are not there, I doubt it is really true
 that it is illegal to use 'struct semaphore' types as counting
 semaphores across multiple threads. BESIDES: Everything works fine
 UNTIL a signal is generated during a block on the semaphore. I think
 Ingo tried to make the 'struct semaphore' type to behave like the
 non-RT kernel 'struct semaphore', which actually does NOT show this
 problem wtih my example driver!!!

Right, because a non-RT semaphore _is_ a compat_semaphore. I'm saying if
you see the bug in your driver with the compat_semaphore then lets debug
that. Because that _is_ a bug!



 So, this is a regression if exactly the same driver is used in both
 non-preempt-rt patched kernel and preempt-rt patched kernels.

Not really.  There are things that the preempt-rt kernels require. One, is
that things that need to keep semaphores instead of using them as mutexes,
they should be converted to compat_semaphores.  Perhaps now that we have
mutexes, we can remove the PI on semaphores, and out-of-tree drivers will
need to make sure they don't use semaphores as mutexes anymore.


  down_interruptible(&dummy);
  printk("We will block now, and if you press CTRL-C from here, we 
  get an OOPS.\n");
  down_interruptible(&dummy);
 
  This double down is actually illegal with rt semaphores. Because we treat
  semaphores as mutexes unless they are declared as compat_semaphores. In
  which case we don't do PI.

 According to the code there is a counting mechanism there, which suggests
 that this is allowed to do. It works fine, until a signal arrives. The
 SIGNAL is the only problem here!

Yeah, that code is more of a hack to convert counting semaphores into
mutexes. But semaphores still need to have owners, and they should not
block on themselves. That may be where the bug is.


  Seems that you need to work out how to use a completion for your code. And
  if that doesn't work, then use a compat_semaphore. But beware, that the
  compat_semaphore 

Re: [BUG on PREEMPT_RT, 2.6.23.1-rt5] in rt-mutex code and signals

2007-11-17 Thread Steven Rostedt

On Sat, 17 Nov 2007, Steven Rostedt wrote:
 On Sat, 17 Nov 2007, Remy Bohmer wrote:
  * PI should only take place if it is for 100% sure that the 'struct
  semaphore' is used as a mutex. And this is only the case when it is
  initialised with init_MUTEX().

 Well, we can't determine that with code ;-)  Remember, there are still
 drivers out in the world that use semaphores as mutexes. So the PI
 on semaphores is really more of a compatibility issue.


Oh, I missed your point here. I was up late last night so I'm blaming that
for my _not_so_thorough_ reading ;-)

You are saying that we should change the semaphore to a mutex only if it
was initialized by init_MUTEX(). I see your point.

Actually, if something is initialized by init_MUTEX it really needs to be
a mutex :-)

Right now the compiler determines that something is a mutex or a
semaphore. If we make it a run time option, that will add more complexity
to the code and probably make it less efficient. The more complex part
is something we can deal with. The less efficient part we can not.

find .  -name *.c ! -type d  | xargs grep  init_MUTEX | wc -l
100

Heh, nice number.

Well, I guess I could start sending patches to mainline to convert
semaphores to mutexes. I'm sure that will probably annoy akpm, but I'll do
it one at a time, with lots of thought behind each change.

BTW, drivers breaking with the RT patch is not always considered a
regression.

For example, this is fine in vanilla kernel:

  local_irq_save(flags);
  [...]
  spin_lock(lock);

Where as in the RT patch, that would break. And it would require either
changing the local_irq_save to a local_irq_save_nort, or find a way to do
the spin_lock_irqsave(lock, flags) instead.

-- Steve


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH][RT] 2.6.24-rc2-rt1 drivers/dma/ioat_dma.c compile fix

2007-11-16 Thread Steven Rostedt

On Fri, 16 Nov 2007, Nelson, Shannon wrote:
 first->async_tx.phys;
  -   __list_splice(new_chain, ioat_chan->used_desc.prev);
  +   list_splice_tail(new_chain, ioat_chan->used_desc.prev);
 

 NAK.

 These functions do insertions differently.  The 'prev' is pointing to
 the last valid descriptor in the queue and you really want to get the
 new_chain stuck on after this.  Your list_splice_tail() will insert the
 new_chain just before it which will muck up the order of the DMA
 requests.

 You might have more success with
    list_splice_tail(new_chain, &ioat_chan->used_desc);
 where used_desc points to the whole list, rather than using the .prev
 pointer to a specific node.

 Please copy me on future ioatdma related comments.


And people wonder why we post RT related patches to LKML. This is exactly
why!

Thanks for the response Shannon!

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [BUG on PREEMPT_RT, 2.6.23.1-rt5] in rt-mutex code and signals

2007-11-16 Thread Steven Rostedt



On Sat, 17 Nov 2007, Remy Bohmer wrote:

 Hello Steven,

 Thanks for your reply

  The above sounds more like you need a completion.
 Funny, I first started with using completion structures, but that did
 not work either. I get similar OOPses on all these kind of locking
 mechanisms, as long as I use the _interruptible() type. I tried every
 work-around I can think of, but none worked :-((
 Even if I block on an ordinary rt-mutex in the same routine, wait a
 _interruptible() type, I get the same problem.

The taker of a mutex must also be the one that releases it.  I don't see
how you could use a mutex for this. It really requires some kind of
completion, or a compat_semaphore.


  What's used to wake up the caller of down_interruptible?
 A call to up() is used from inside an interrupt(thread) context, but
 this is not relevant for the problem, because only blocking on a
 semaphore with down_interruptible() and waking the thread by CTRL-C is
 enough to get this Oops.

 I saw that the code is trying to wake 'other waiters', while there is
 only 1 thread waiting on the semaphore at most. I feel that the root
 cause of this problem has to be searched there.

 I believe that executing any PI code on semaphores is a strange
 behavior anyway, because a semaphore is never 'owned' by a thread, and
 it is always another thread that wakes the thread that blocks on a
 semaphore, and because the waker is unknown, the PI code will always
 boost the prio of the wrong thread.

Exactly why it should be a completion or compat semaphores. The reason we
did PI on semaphores is only because they were used as mutexes before Ingo
pushed to actually get a mutex primitive into the kernel. Since then,
we've been trying to remove all semaphores with either a mutex or
completion.


 Strange is also, that I get different behavior on ARM if I use
 sema_init(sema, 1) versus sema_init(sema,0). The latter seems to
 crash less, it will not crash until the first up(); while the first
 will crash even without any up().

 Attached I have put a sample driver I just hacked together a few
 minutes ago. It is NOT the driver that has generates the oops in the
 previous mail, but I have stripped a scull-driver down that much that
 it will be much easier to talk about, and to keep us focussed on the
 part of the code that is causing this.
 Besides: I tested this driver on X86 with 2.6.23.1-rt5 and I get the
 also OOPSes although slightly different than on ARM. See the attached
 dummy.txt file.


 Beware: The up(sema) is NOT required to get this OOPS, I get it even
 without any up(sema) !

 I hope you can look at the attached driver source and help me with this...


down_interruptible(&dummy);
printk("We will block now, and if you press CTRL-C from here, we get an 
OOPS.\n");
down_interruptible(&dummy);


This double down is actually illegal with rt semaphores. Because we treat
semaphores as mutexes unless they are declared as compat_semaphores. In
which case we don't do PI.

Seems that you need to work out how to use a completion for your code. And
if that doesn't work, then use a compat_semaphore. But beware, that the
compat_semaphore can cause unbounded latencies. But then again, so can
completions.


-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: cyclic test results on 8 way (2.6.23.1-rt7 through 2.6.23.1-rt11)

2007-11-13 Thread Steven Rostedt


On Tue, 13 Nov 2007, Jaswinder Singh wrote:

 On Nov 8, 2007 1:04 PM, Darren Hart [EMAIL PROTECTED] wrote:
  # ./cyclictest -n -i 1 -l 1 -p 95

 1 (10 milliseconds) interval seems to be quite big for current
 machine. 10 milliseconds is good for 10 to 15 years old machine but
 not for latest machines.

 I think we should try -i 1000 or -i 4000 .


heh, I test with -i 250.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: cyclic test results on 8 way (2.6.23.1-rt7 through 2.6.23.1-rt11)

2007-11-13 Thread Steven Rostedt

On Tue, 13 Nov 2007, Jaswinder Singh wrote:

 Hello Greg and Darren,

 If possible, can you please also add processor ID/number in your
 cyclictest and run more tasks (say 40, 60 or 80 tasks) then we have
 good picture like how many processes are running in which processor
 and how they switching between processors.

 And can we force processes to run on specific processor or core.


Cyclic test isn't really about testing RT balancing. Even though it does
have an effect. For that type of test you want my rt-migrate-test.c
http://rostedt.homelinux.com/rt/rt-migrate-test.c
or Gregory's (when he fixes it ;-)
http://rt.wiki.kernel.org/index.php/Preemption_Test

-- Steve
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: cyclic test results on 8 way (2.6.23.1-rt7 through 2.6.23.1-rt11)

2007-11-13 Thread Steven Rostedt


On Tue, 13 Nov 2007, Darren Hart wrote:

 On Tuesday 13 November 2007 06:15:03 Steven Rostedt wrote:
  On Tue, 13 Nov 2007, Jaswinder Singh wrote:
   On Nov 8, 2007 1:04 PM, Darren Hart [EMAIL PROTECTED] wrote:
# ./cyclictest -n -i 1 -l 1 -p 95
  
   1 (10 milliseconds) interval seems to be quite big for current
   machine. 10 milliseconds is good for 10 to 15 years old machine but
   not for latest machines.
  
   I think we should try -i 1000 or -i 4000 .
 
  heh, I test with -i 250.

 Someone, I'm sorry I can't recall who atm, suggested that using a larger
 interval would allow for more variance to be introduced - not keeping the
 caches so hot for this particular test by not spending so much time on the
 cpu.  Is this a valid approach?  Perhaps running multiple runs with both very
 tight intervals (like Steve's case) and some longer intervals to ensure we
 can handle both cases - since both are common in practice.


I don't think it would hurt to test both cases. Perhaps three kinds.

  -i 250
  -i 1000
  -i 1

do them separately, and that should give us a good idea of running fast as
well as cache cold.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23.1-rt9 (and others)

2007-11-07 Thread Steven Rostedt

On Wed, 7 Nov 2007, Dragan Noveski wrote:

 Gregory Haskins wrote:
  Doh!  Your error made me realize that I broke uniprocessor in -rt9.  Will 
  fix right away.
 
  As far as -rt7 is concerned, that doesn't make a lot of sense since cpupri 
  isnt introduced until -rt9.  Perhaps your tree was dirtied from a previous 
  application of -rt9?  Let me know if that doesn't appear to be the case.
 
  Regards,
  -Greg
 
 
 
 sorry, i am not sure if i did not done some missmatch by copying the
 config file into the tree, but i am always doing 'rm -r', and unpacking
 the tree before compiling.
 i tried again the rt9 (100% without missmatching) but it does not work
 i ll give a try with the rt6 now.

 very much thanks for the support and cheers,


-rt6 is broken.

I'd recommend doing the following:

 wget -O /usr/local/bin/ketchup 
http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt1

 mkdir tmp
 cd tmp
 ketchup -r -G 2.6.23.1-rt7

This will get you the 2.6.23.1-rt7 kernel and rename the tmp directory
to linux-2.6.23.1-rt7

After you compiled and install -rt7 while in the same directory you can do

 ketchup -r -G 2.6.23.1-rt10

and it will update that kernel tree to 2.6.23.1-rt10 and again rename that
directory. (2.6.23.1-rt10 which BTW has the compile fix).


-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23.1-rt9 (and others)

2007-11-07 Thread Steven Rostedt

On Wed, 7 Nov 2007, Steven Rostedt wrote:

 This is a special announcement for the latest -rt patches. This is
 actually announcing more than one tree (pay close attention to the
 differences between -rt7, -rt8 and -rt9).


[...]

   2.6.23.1-rt9

- RT balancing by CPU priorities (Gregory Haskins)

-rt9 has been found to break UP compilation, so -rt10 has been released
with the fix.



 Now benchmarks against 2.6.23.1-rt7 -rt8 and -rt9 would be greatly
 appreciated.  These three are all present in

So please compare -rt7, -rt8 and -rt10.

-- Steve


   http://www.kernel.org/pub/linux/kernel/projects/rt/

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23.1-rt9 (and others)

2007-11-06 Thread Steven Rostedt
This is a special announcement for the latest -rt patches. This is
actually announcing more than one tree (pay close attention to the
differences between -rt7, -rt8 and -rt9).

  2.6.23.1-rt6

   - Removed BUG_ON in exit (Steven Rostedt and Daniel Walker)

   -  Turn RCU preempt boost on by default (Steven Rostedt)
  (for when RCU PREEMPT is enabled)

   - Fixes for PowerPC (Paul McKenney)


  2.6.23.1-rt7

   - Found that there's a flaw in the PowerPC patch so
 it was pulled from the tree.

  2.6.23.1-rt8

   - More aggressive RT Balancing (Gregory Haskins)

  2.6.23.1-rt9

   - RT balancing by CPU priorities (Gregory Haskins)


Now benchmarks against 2.6.23.1-rt7 -rt8 and -rt9 would be greatly
appreciated.  These three are all present in

  http://www.kernel.org/pub/linux/kernel/projects/rt/

Gregory and I have been having disagreements on how to solve RT task
balancing among CPUS. Although we shared ideas back and forth, and both
our methods have been greatly influenced by each other, the real answer
comes from actual numbers. So these three versions are posted for your
convenience to see which actually do the best. I would be happy to tell
Gregory he's right, if the numbers prove it.

Currently, what we do to test RT latencies is to run Thomas Gleixner's
cyclictest
(http://git.kernel.org/?p=linux/kernel/git/tglx/rt-tests.git;a=summary)
as well as hackbench, to see what the maximum latencies we get are.

Other tests are welcomed too.


to build a 2.6.23.1-rt7 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.1.tar.bz2 
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.1-rt7.bz2

to build a 2.6.23.1-rt8 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.1.tar.bz2 
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.1-rt8.bz2

to build a 2.6.23.1-rt9 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.1.tar.bz2 
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.1-rt9.bz2


And like always, my RT version of Matt Mackall's ketchup will get this
for you nicely:

  http://people.redhat.com/srostedt/rt/tools/ketchup-0.9.8-rt1

The broken out patches are also available.

Thanks!

-- Steve


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: old rt patches

2007-11-01 Thread Steven Rostedt

--
On Thu, 1 Nov 2007, Jaswinder Singh wrote:

 Steve,

 Thanks for your ketchup.

 Please let me know:

 1. Is 2.6.13-rt1 is first rt version

Not by a long shot.  The earliest hint of a RT patch I have is a left over
vi swap file:

   .realtime-preempt-2.6.10-rc1-mm3-V0.7.25-1.swp

I think I started with Ingo's work in 2.6.9, so I don't even know what the
first version I used was.


 2. how to read text with black background as in
 http://rostedt.homelinux.com/
 do I need to learn some martial arts.

No you need to see www.kihontech.com
 (my own business - not active at the moment)

-- Steve
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23.1-rt5 rtc lost interrupt

2007-10-31 Thread Steven Rostedt


--
When I'm asked what language is my mother tongue,
I simply answer C.

On Wed, 31 Oct 2007, Dragan Noveski wrote:

 hallo list, yesterday strange thing happened here.
 i am running 2.6.23.1-rt5, and actually it is running good, but
 yesterday partially my keyboard stopped working. usually if i press any
 key (inside of xterm) and do not release, but keep pressed, the
 letter/sign will be printed all the time again and again as long as i
 release the key.
 something like:

 a

What X drivers are you running (hopefully not nVidia or some other
proprietary module).


 but yesterday, if i was holding the key, the letter got displayed on the
 screen just one time.
 if i released and pressed again, one more time.
 so i was not able to press only one time a key and hold to get sth like
 these:

 

 than in /var/log/system i found this lines:


 Oct 30 21:22:37 murija2 kernel: rtc: lost 8 interrupts
 Oct 30 21:22:50 murija2 kernel: rtc: lost 4 interrupts
 Oct 30 21:22:50 murija2 kernel: rtc: lost 20 interrupts

Unfortunately, this gives me no clue to what went wrong. Perhaps, you have
a high priority process running somewhere that is keeping things from
running.

What type of system is this? SMP?


 this also appeared in dmesg.

 is this explained so that someone is able to understand?
 does anyone have an idea what went wrong here?

 after a reboot the keyboard was working as usually again and the issue
 did not happen again since then.

 hope to get some suggestions/answers?!


I can't figure out much with the current info.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: CONFIG_CRITICAL_IRQSOFF_TIMING

2007-10-31 Thread Steven Rostedt

--
  [...]
 
   CONFIG_INTERRUPT_OFF_HIST=y
   CONFIG_CRITICAL_LATENCY_HIST=y
   CONFIG_LATENCY_HIST=y
 
  Turn off all histograms. They will kill any measurements you are taking.
  And yes, you will not get the output from latency_trace that you are
  after.

 Steven and Daniel, I tried your suggestion with the following results:
 CRITICAL_IRQSOFF_TIMING: preempt_max_latency remains 0 even under load,
  cyclictest -b is able to store a trace in 
 /proc/latency_trace
 CRITICAL_PREEMPT_TIMING: kernel won´t start at all (vmlinuz is read from tftp 
 server but nothing afterwards)
 WAKEUP_TIMING:   is working as expected.

 So switching off the histograms is an improvement. For the moment, I can live 
 with this situation
 (especially the cyclictest -b option is very useful),
 but I wonder if anyone else has the same problems?

Critical irqs-off timing won't start with wakeup timing on. You need to
disable that, but that you can do from the command line.

echo 0  /proc/sys/kernel/wakeup_timing

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -rt] remove in_interrupt() BUG_ON in exit path

2007-10-30 Thread Steven Rostedt

--
On Tue, 30 Oct 2007, Daniel Walker wrote:

 As you can see from the patch there is already a case to catch this type
 of problem. This BUG_ON() adds a recursive OOPS , since BUG() causes an OOPS
 and the OOPS just calls do_exit() , and do_exit() just causes another OOPS 
 with
 this BUG_ON().. If we call panic() here then it's clear what the problem was,
 instead of causing an endless recursive crash.

 Signed-off-by: Daniel Walker [EMAIL PROTECTED]

 ---
  kernel/exit.c |1 -
  1 file changed, 1 deletion(-)

 Index: linux-2.6.23.1/kernel/exit.c
 ===
 --- linux-2.6.23.1.orig/kernel/exit.c
 +++ linux-2.6.23.1/kernel/exit.c
 @@ -895,7 +895,6 @@ fastcall NORET_TYPE void do_exit(long co

   WARN_ON(atomic_read(tsk-fs_excl));

 - BUG_ON(in_interrupt());
   if (unlikely(in_interrupt()))
   panic(Aiee, killing interrupt handler!);
   if (unlikely(!tsk-pid))

I did this change once before, while debugging. I had the same issue. This
BUG_ON was giving me recursive crashes that prevented me knowing WTF was
going on.  I thought I even submitted a patch to remove it. Perhaps I
forgot to. Nope, I did!

http://www.ussg.iu.edu/hypermail/linux/kernel/0707.0/1804.html

Since the change is added by the preempt-realtime-core.patch, I'll just
remove it from there.

IOW, I'll fold this into that patch.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: CONFIG_CRITICAL_IRQSOFF_TIMING

2007-10-30 Thread Steven Rostedt

I somehow was removed from the CC list, so I didn't see it.

rantI hate lists that think it's best to only post to the list and tell
you to take all people off the CC list. Those are lists that are run by
people that have way too much time on their hands to be reading mailing
lists all day. Between LKML, linux-rt-users, virt, lguest, and a ton of
internal lists,  I can't keep up! When in a coding frenzy, I may
not read lists for days./rant


--
On Tue, 30 Oct 2007, Gerrit Binnenmars wrote:

  Datum: 29/10/07 02:09 PM
  Van: Steven Rostedt
  Aan: Gerrit Binnenmars
  CC: linux-rt-users@vger.kernel.org
  Onderwerp : Re: CONFIG_CRITICAL_IRQSOFF_TIMING
 
 
  --
  On Sun, 28 Oct 2007, Gerrit Binnenmars wrote:
 
   Hello,
  
   Can someone send me a .config file that shows the options needed to get 
   the interrupt off
   timing values in preempt_max_latency, please. I only see a value of 0 in 
   this file
   I am using kernel 2.6.22.1 with the corresponding rt patch.
  
   Thanks in advance,
 
  Make sure you have event trace and latency tracing as well as
  Interrupts-off critical section latency timing.
 
  Then you need to echo 0  /proc/sys/kernel/preempt_max_latency.
 
 Hello Steve,

 Thanks for your quick response, but the preempt_max_latency remains 0. Also 
 the latency_hist shows no samples taken.
 Can you or someone else have a look at the corresponding CONFIG options?

 CONFIG_LTT_PROBE_LOCKING=m
 # CONFIG_LTT_PROBE_STACK is not set

 #
 # Kernel hacking
 #

[...]

 CONFIG_INTERRUPT_OFF_HIST=y
 CONFIG_CRITICAL_LATENCY_HIST=y
 CONFIG_LATENCY_HIST=y

Turn off all histograms. They will kill any measurements you are taking.
And yes, you will not get the output from latency_trace that you are
after.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: CONFIG_CRITICAL_IRQSOFF_TIMING

2007-10-29 Thread Steven Rostedt

--
On Sun, 28 Oct 2007, Gerrit Binnenmars wrote:

 Hello,

 Can someone send me a .config file that shows the options needed to get the 
 interrupt off
 timing values in preempt_max_latency, please. I only see a value of 0 in this 
 file
 I am using kernel 2.6.22.1 with the corresponding rt patch.

 Thanks in advance,

Make sure you have event trace and latency tracing as well as
Interrupts-off critical section latency timing.

Then you need to echo 0  /proc/sys/kernel/preempt_max_latency.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23.1-rt5

2007-10-29 Thread Steven Rostedt
We are pleased to announce the 2.6.23.1-rt5 tree, which can be
downloaded from the location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.23.1-rt4

  - Compile for UP (Discovered by Dragan Noveski)

  - ntfs local_irq_save_nort (Mike Galbraith)

  - Don't disable preemption on handlers without IST (Andi Kleen)

  - Minor cleanups (Steven Rostedt)

to build a 2.6.23.1-rt5 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.1.tar.bz2 
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.1-rt5.bz2

The broken out patches are also available.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [2.6.23-rt3] NMI watchdog trace of deadlock

2007-10-27 Thread Steven Rostedt

--
On Sat, 27 Oct 2007, Mike Galbraith wrote:

 Hm.  Looking at the change to mm/bounce.c, perhaps I should do this
 instead?

 --- ./fs/ntfs/aops.c.org  2007-10-27 10:16:40.0 +0200
 +++ ./fs/ntfs/aops.c  2007-10-27 12:11:07.0 +0200
 @@ -139,13 +139,13 @@ static void ntfs_end_buffer_async_read(s
   recs = PAGE_CACHE_SIZE / rec_size;
   /* Should have been verified before we got here... */
   BUG_ON(!recs);
 - local_irq_save(flags);
 + local_irq_save_nort(flags);
   kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
   for (i = 0; i  recs; i++)
   post_read_mst_fixup((NTFS_RECORD*)(kaddr +
   i * rec_size), rec_size);
   kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
 - local_irq_restore(flags);
 + local_irq_restore_nort(flags);
   flush_dcache_page(page);
   if (likely(page_uptodate  !PageError(page)))
   SetPageUptodate(page);


Thanks, applied.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Don't disable preemption in exception handlers without IST

2007-10-27 Thread Steven Rostedt


--
On Sat, 27 Oct 2007, Andi Kleen wrote:


 Some of the exception handlers that run on an IST in a normal kernel
 still disable preemption. This causes might_sleep warning when sending signals
 for debugging in PREEMPT-RT because sending signals can take a lock.
 Since the ISTs are disabled now for those don't disable the preemption.

 This completes the remove IST patch I sent some time ago and fixes
 another case where using gdb caused warnings.

 Also it will likely improve latency a little bit.

 Signed-off-by: Andi Kleen [EMAIL PROTECTED]

Thanks, applied.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23-rt4 (was 2.6.23-rt1 trouble)

2007-10-27 Thread Steven Rostedt
--

On Sat, 27 Oct 2007, Rui Nuno Capela wrote:

  On Mon, October 15, 2007 11:49, Rui Nuno Capela wrote:
 
  I am experiencing some highly annoying but intermitent freezing on a
  pentium4 2.80G HT/SMT box, when doing normal desktop work with 2.6.23-rt1.
 
 
  The same crippling behavior does not occur on a Core 2 Due T7200 2.0G
  SMP, so I suspect it's something due specific to the SMT scheduling
  support (Hyper-Threading). But can't tell for sure, obviously :)
 
 
  I was wrong. After several trials the same behavior also occurs on the
  Core2 Duo T7200. It just took longer to show its nasty.
 
 
  The symptoms are noticeable primarily as some X/GUI intermitent freezing,
  sometimes only one application, then several and ultimately the whole X
  desktop becomes completely unresponsive. It looks like scheduling

When things start to freeze, could you capture the output of a sysrq-t.


  problems. There is this hint that switching to a spare console terminal
  (via Ctrl+Alt+Fn) might cause later recovery. But its just a question of
  some more time for it just happens again and again, one after another,
  several applications becoming temporarily frozen and just by luck the
  system gets back to normal, probably due to some incidental shake-up :)
  but there are other times that nothing seems to help with no alternative
  to the power-reset switch.
 
  I could not find any evidence on dmesg or in the system logs, of any
  apparent trouble. No BUGs, no oops, no panics, no nothing. It just
  freezes, this and that, now and then. It just makes it all unworkable
  and obviously subject to ditching.
 
  Again, this only happens on this P4/HT box. On a Core2 Duo laptop, with
  same 2.6.23-rt1 with the very same kernel configuration, it does not show
  any illness and is running quite fine.
 
 
  False. It used to run fine, until the creeps happen first time :(
 
 
  Remember one report I had about a similar freezing behavior? Now it's
  happening the other way around: the core2 is OK, the pentium4 is KO.
 
 
  Now it applies to all 2.6.23-rt1 images I could test upon.
 
 
  One naive suspicion goes like the new rcu-preempt code is to blame, since
  I don't remember having this or any other trouble with 2.6.23-rc8-rt1.
 
 
  Not be sure anymore, but this seems to be still a valid assumption.
 

 just to let you know that still the same trouble persists with 2.6.23.1-rt4

 .config can be found here:
http://www.rncbc.org/datahub/config-2.6.23.1-rt4.0


I have a P4HT laptop (unfortunately with no serial). I use it as one of my
main machines, so it will suck for me when it freezes ;-). I'll take your
config and try it out.

I'll most likely do this on Monday since process Wife has the highest
priority over the weekend ;-)

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Possible bug in 23rt3 preempt-irqs-core.patch

2007-10-26 Thread Steven Rostedt

--
On Thu, 25 Oct 2007, Paul Gortmaker wrote:

 I was looking over the patches in preempt-irqs-core.patch in the broken
 out 23rt3 series and came across this chunk:

 ---
 @@ -325,6 +349,11 @@ int setup_irq(unsigned int irq, struct i
 if (!shared) {
 irq_chip_set_defaults(desc-chip);

 +   /*
 +* Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY:
 +*/
 +   recalculate_desc_flags(desc);
 +
  #if defined(CONFIG_IRQ_PER_CPU)
 if (new-flags  IRQF_PERCPU)
 desc-status |= IRQ_PER_CPU;
 ---

 Note the recalculate is actually contained within the if (!shared)
 even though at a casual glance the indentation suggests otherwise.

 Looking at older versions of the broken out patches leads me to believe
 the if clause should be after the recalculate.  I've attached a patch to
 do just that, but if you'd rather I just respin the preempt-irqs-core.patch
 then I could just as easily do that as well.


Grumble,  That's what I get for keeping fuzzy patching turned on in quilt
:-(


 --- linux-2.6.23-rt3/kernel/irq/manage.c~
 +++ linux-2.6.23-rt3/kernel/irq/manage.c
 @@ -354,14 +354,14 @@
   if (new-flags  IRQF_NOBALANCING)
   desc-status |= IRQ_NO_BALANCING;

 - if (!shared) {
 - irq_chip_set_defaults(desc-chip);
 -
   /*
* Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY:
*/
   recalculate_desc_flags(desc);

 + if (!shared) {
 + irq_chip_set_defaults(desc-chip);
 +
  #if defined(CONFIG_IRQ_PER_CPU)
   if (new-flags  IRQF_PERCPU)
   desc-status |= IRQ_PER_CPU;


Thanks,

Applied!

-- Steve

P.S. I have since turned off fuzzy logic for updating -rt (I may need to
go back and reapply old patches to see what else broke :-()


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] RT: Cache cpus_allowed weight for optimizing migration

2007-10-26 Thread Steven Rostedt

--
On Fri, 26 Oct 2007, Gregory Haskins wrote:

 This version has the feedback from Steve's review incorporated

 -

 RT: Cache cpus_allowed weight for optimizing migration

 Some RT tasks (particularly kthreads) are bound to one specific CPU.
 It is fairly common for one or more bound tasks to get queued up at the
 same time.  Consider, for instance, softirq_timer and softirq_sched.  A
 timer goes off in an ISR which schedules softirq_thread to run at RT50.
 Then during the handling of the timer, the system determines that it's
 time to smp-rebalance the system so it schedules softirq_sched to run
 from within the softirq_timer kthread context. So we are in a situation
 where we have two RT50 tasks queued, and the system will go into
 rt-overload condition to request other CPUs for help.

 The problem is that these tasks cannot ever be pulled away since they
 are already running on their one and only valid RQ.  However, the other
 CPUs cannot determine that the tasks are unpullable without going
 through expensive checks/locking.  Therefore the helping CPUS

A little exaggeration there ;-)  We don't need to go through locking (at
least my code doesn't), and the checks are not that expensive (could cause
some unneeded cache bouncing though).

 experience unnecessary overhead/latencies regardless as they
 ineffectively try to process the overload condition.

 This patch tries to optimize the situation by utilizing the hamming
 weight of the task-cpus_allowed mask.  A weight of 1 indicates that
 the task cannot be migrated, which may be utilized by the overload

s/may/will/

 handling code to eliminate unnecessary rebalance attempts.  We also
 introduce a per-rq variable to count the number of migratable tasks
 that are currently running.  We only go into overload if we have more
 than one rt task, AND at least one of them is migratable.

 Calculating the weight is probably relatively expensive, so it is only
 done when the cpus_allowed mask is updated (which should be relatively
 infrequent, especially compared to scheduling frequency) and cached in
 the task_struct.


 Signed-off-by: Gregory Haskins [EMAIL PROTECTED]
 ---

  include/linux/sched.h |2 ++
  kernel/fork.c |1 +
  kernel/sched.c|9 +++-
  kernel/sched_rt.c |   58 
 +
  4 files changed, 64 insertions(+), 6 deletions(-)

 diff --git a/include/linux/sched.h b/include/linux/sched.h
 index 7a3829f..829de6f 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
 @@ -1048,6 +1048,7 @@ struct sched_class {
   void (*set_curr_task) (struct rq *rq);
   void (*task_tick) (struct rq *rq, struct task_struct *p);
   void (*task_new) (struct rq *rq, struct task_struct *p);
 + void (*set_cpus_allowed)(struct task_struct *p, cpumask_t newmask);
  };

  struct load_weight {
 @@ -1144,6 +1145,7 @@ struct task_struct {

   unsigned int policy;
   cpumask_t cpus_allowed;
 + int nr_cpus_allowed;
   unsigned int time_slice;

  #ifdef CONFIG_PREEMPT_RCU
 diff --git a/kernel/fork.c b/kernel/fork.c
 index 5f11f23..f808e18 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
 @@ -1257,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long 
 clone_flags,
*/
   preempt_disable();
   p-cpus_allowed = current-cpus_allowed;
 + p-nr_cpus_allowed = current-nr_cpus_allowed;
   if (unlikely(!cpu_isset(task_cpu(p), p-cpus_allowed) ||
   !cpu_online(task_cpu(p
   set_task_cpu(p, smp_processor_id());
 diff --git a/kernel/sched.c b/kernel/sched.c
 index 30fa531..6c90093 100644
 --- a/kernel/sched.c
 +++ b/kernel/sched.c
 @@ -262,6 +262,7 @@ struct rt_rq {
   int rt_load_balance_idx;
   struct list_head *rt_load_balance_head, *rt_load_balance_curr;
   unsigned long rt_nr_running;
 + unsigned long rt_nr_migratory;
   unsigned long rt_nr_uninterruptible;
   /* highest queued rt task prio */
   int highest_prio;
 @@ -5371,7 +5372,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t 
 new_mask)
   goto out;
   }

 - p-cpus_allowed = new_mask;
 + if (p-sched_class-set_cpus_allowed)
 + p-sched_class-set_cpus_allowed(p, new_mask);
 + else {
 + p-cpus_allowed= new_mask;
 + p-nr_cpus_allowed = cpus_weight(new_mask);
 + }
 +
   /* Can the task run on the task's current CPU? If so, we're done */
   if (cpu_isset(task_cpu(p), new_mask))
   goto out;
 diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
 index b59dc20..6d293bd 100644
 --- a/kernel/sched_rt.c
 +++ b/kernel/sched_rt.c
 @@ -50,6 +50,24 @@ static inline void update_curr_rt(struct rq *rq)
   curr-se.sum_exec_runtime += delta_exec;
   curr-se.exec_start = rq-clock;
  }
 +#ifdef CONFIG_SMP
 +static void inc_rt_migration(struct task_struct *p, struct rq *rq)
 +{
 + 

Re: [PATCH] RT: Cache cpus_allowed weight for optimizing migration

2007-10-26 Thread Steven Rostedt
--
On Fri, 26 Oct 2007, Gregory Haskins wrote:

 On Fri, 2007-10-26 at 10:47 -0400, Steven Rostedt wrote:
  --
  On Fri, 26 Oct 2007, Gregory Haskins wrote:
 
   This version has the feedback from Steve's review incorporated
  
   -
  
   RT: Cache cpus_allowed weight for optimizing migration
  
   Some RT tasks (particularly kthreads) are bound to one specific CPU.
   It is fairly common for one or more bound tasks to get queued up at the
   same time.  Consider, for instance, softirq_timer and softirq_sched.  A
   timer goes off in an ISR which schedules softirq_thread to run at RT50.
   Then during the handling of the timer, the system determines that it's
   time to smp-rebalance the system so it schedules softirq_sched to run
   from within the softirq_timer kthread context. So we are in a situation
   where we have two RT50 tasks queued, and the system will go into
   rt-overload condition to request other CPUs for help.
  
   The problem is that these tasks cannot ever be pulled away since they
   are already running on their one and only valid RQ.  However, the other
   CPUs cannot determine that the tasks are unpullable without going
   through expensive checks/locking.  Therefore the helping CPUS
 
  A little exaggeration there ;-)  We don't need to go through locking (at
  least my code doesn't), and the checks are not that expensive (could cause
  some unneeded cache bouncing though).

 Well, its all relative.  Not going into overload to begin with, and
 therefore not hitting the pull_rt_tasks() logic outright should be
 faster than lockless scanning tricks ;)

 But I digress.  I don't think the original code had your optimization,
 so you are right:  It's perhaps a bit of an exaggeration in HEAD.  I'll
 be less dramatic for the next drop ;)


I'll digress some more ;-)

Actually, what I want to stress is that the greatest benefit of this patch
is not removing the unneed checks to processes that can't migrate (it is a
benefit though), but the real benefit that I'm looking forward to is the
migrating of lower priority tasks that can migrate. And these tasks wont
be affected by tasks that can't migrate.

Looking forward to your next patch.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Possible bug in 23rt3 preempt-irqs-core.patch

2007-10-26 Thread Steven Rostedt

--
On Fri, 26 Oct 2007, Steven Rostedt wrote:


 --
 On Thu, 25 Oct 2007, Paul Gortmaker wrote:

  I was looking over the patches in preempt-irqs-core.patch in the broken
  out 23rt3 series and came across this chunk:
 
  ---
  @@ -325,6 +349,11 @@ int setup_irq(unsigned int irq, struct i
  if (!shared) {
  irq_chip_set_defaults(desc-chip);
 
  +   /*
  +* Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY:
  +*/
  +   recalculate_desc_flags(desc);
  +
   #if defined(CONFIG_IRQ_PER_CPU)
  if (new-flags  IRQF_PERCPU)
  desc-status |= IRQ_PER_CPU;
  ---
 
  Note the recalculate is actually contained within the if (!shared)
  even though at a casual glance the indentation suggests otherwise.
 
  Looking at older versions of the broken out patches leads me to believe
  the if clause should be after the recalculate.  I've attached a patch to
  do just that, but if you'd rather I just respin the preempt-irqs-core.patch
  then I could just as easily do that as well.
 

 Grumble,  That's what I get for keeping fuzzy patching turned on in quilt
 :-(


Looking at the patch set I started with, the bug exists there too. I just
did a full -F0 rework of what I started with and didn't find anything else
that could have been caused by fuzzy logic. Seems this bug crept in before
I took over as patch monkey.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] RT: Cache cpus_allowed weight for optimizing migration

2007-10-26 Thread Steven Rostedt

I forgot to tell you this the last time. You need to add:

---
 include/linux/init_task.h |1 +
 1 file changed, 1 insertion(+)

Index: linux-2.6.23-rt3/include/linux/init_task.h
===
--- linux-2.6.23-rt3.orig/include/linux/init_task.h
+++ linux-2.6.23-rt3/include/linux/init_task.h
@@ -142,6 +142,7 @@ extern struct group_info init_groups;
.policy = SCHED_NORMAL, \
INIT_RCU_BOOST_PRIO \
.cpus_allowed   = CPU_MASK_ALL, \
+   .nr_cpus_allowed = NR_CPUS, \
.mm = NULL, \
.active_mm  = init_mm, \
.run_list   = LIST_HEAD_INIT(tsk.run_list), \

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23.1-rt4

2007-10-26 Thread Steven Rostedt
We are pleased to announce the 2.6.23.1-rt4 tree, which can be
downloaded from the location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.23-rt3

  - Updated to stable release 2.6.23.1

  - Added latest High Resolution timers work

  - RT Balance CPU weight optimization (Gregory Haskins)

  - Limit to NUMA Node RT Balance (Steven Rostedt)

  - Workqueue PI sched.h compile fix (Clark Williams)

  - plist debug init fix (Jan Kiszka)

  - Kconfig updates (Carsten Emde)

  - IRQ flags recalculation fix (Paul Gortmaker)

to build a 2.6.23.1-rt4 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.1.tar.bz2
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23.1-rt4.bz2

The broken out patches are also available.

-- Steve


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] RT: Cache cpus_allowed weight for optimizing migration

2007-10-25 Thread Steven Rostedt

--
On Thu, 25 Oct 2007, Gregory Haskins wrote:

 Some RT tasks (particularly kthreads) are bound to one specific CPU.
 It is fairly common for one or more bound tasks to get queued up at the
 same time.  Consider, for instance, softirq_timer and softirq_sched.  A
 timer goes off in an ISR which schedules softirq_thread to run at RT50.
 Then during the handling of the timer, the system determines that it's
 time to smp-rebalance the system so it schedules softirq_sched to run
 from within the softirq_timer kthread context. So we are in a situation
 where we have two RT50 tasks queued, and the system will go into
 rt-overload condition to request other CPUs for help.

 The problem is that these tasks cannot ever be pulled away since they
 are already running on their one and only valid RQ.  However, the other
 CPUs cannot determine that the tasks are unpullable without going
 through expensive checks/locking.  Therefore the helping CPUS
 experience unnecessary overhead/latencies regardless as they
 ineffectively try to process the overload condition.

 This patch tries to optimize the situation by utilizing the hamming
 weight of the task-cpus_allowed mask.  A weight of 1 indicates that
 the task cannot be migrated, which may be utilized by the overload
 handling code to eliminate unnecessary rebalance attempts.  We also
 introduce a per-rq variable to count the number of migratable tasks
 that are currently running.  We only go into overload if we have more
 than one rt task, AND at least one of them is migratable.

 Calculating the weight is probably relatively expensive, so it is only
 done when the cpus_allowed mask is updated (which should be relatively
 infrequent, especially compared to scheduling frequency) and cached in
 the task_struct.


 Signed-off-by: Gregory Haskins [EMAIL PROTECTED]
 ---

  include/linux/sched.h |2 +
  kernel/fork.c |1
  kernel/sched.c|9 +++-
  kernel/sched_rt.c |  116 
 +
  4 files changed, 107 insertions(+), 21 deletions(-)

 diff --git a/include/linux/sched.h b/include/linux/sched.h
 index 7a3829f..829de6f 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
 @@ -1048,6 +1048,7 @@ struct sched_class {
   void (*set_curr_task) (struct rq *rq);
   void (*task_tick) (struct rq *rq, struct task_struct *p);
   void (*task_new) (struct rq *rq, struct task_struct *p);
 + void (*set_cpus_allowed)(struct task_struct *p, cpumask_t newmask);
  };

  struct load_weight {
 @@ -1144,6 +1145,7 @@ struct task_struct {

   unsigned int policy;
   cpumask_t cpus_allowed;
 + int nr_cpus_allowed;
   unsigned int time_slice;

  #ifdef CONFIG_PREEMPT_RCU
 diff --git a/kernel/fork.c b/kernel/fork.c
 index 5f11f23..f808e18 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
 @@ -1257,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long 
 clone_flags,
*/
   preempt_disable();
   p-cpus_allowed = current-cpus_allowed;
 + p-nr_cpus_allowed = current-nr_cpus_allowed;
   if (unlikely(!cpu_isset(task_cpu(p), p-cpus_allowed) ||
   !cpu_online(task_cpu(p
   set_task_cpu(p, smp_processor_id());
 diff --git a/kernel/sched.c b/kernel/sched.c
 index 30fa531..6c90093 100644
 --- a/kernel/sched.c
 +++ b/kernel/sched.c
 @@ -262,6 +262,7 @@ struct rt_rq {
   int rt_load_balance_idx;
   struct list_head *rt_load_balance_head, *rt_load_balance_curr;
   unsigned long rt_nr_running;
 + unsigned long rt_nr_migratory;
   unsigned long rt_nr_uninterruptible;
   /* highest queued rt task prio */
   int highest_prio;
 @@ -5371,7 +5372,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t 
 new_mask)
   goto out;
   }

 - p-cpus_allowed = new_mask;
 + if (p-sched_class-set_cpus_allowed)

Not sure we need this optimization (not doing the nr_cpus_allowed
calculation). Since, due to priority boosting, we will need to calculate
then. Calculating it here is better.

 + p-sched_class-set_cpus_allowed(p,
new_mask);  +  else {
 + p-cpus_allowed= new_mask;
 + p-nr_cpus_allowed = cpus_weight(new_mask);
 + }
 +
   /* Can the task run on the task's current CPU? If so, we're done */
   if (cpu_isset(task_cpu(p), new_mask))
   goto out;
 diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
 index b59dc20..ad35c89 100644
 --- a/kernel/sched_rt.c
 +++ b/kernel/sched_rt.c
 @@ -50,6 +50,24 @@ static inline void update_curr_rt(struct rq *rq)
   curr-se.sum_exec_runtime += delta_exec;
   curr-se.exec_start = rq-clock;
  }
 +#ifdef CONFIG_SMP
 +static void inc_rt_migration(struct task_struct *p, struct rq *rq)
 +{
 + rq-rt.rt_nr_migratory++;
 +
 + if (rq-rt.rt_nr_running  1)
 + rt_set_overload(p, rq-cpu);
 +}
 +
 +static void dec_rt_migration(struct task_struct *p, struct rq *rq)
 +{
 + 

Re: Interrupt Latency module for intel

2007-10-25 Thread Steven Rostedt


Please don't top post.

--

On Thu, 25 Oct 2007, Jaswinder Singh wrote:

 Hello Daniel and Sven-Thorsten Dietrich,

 Can you please let me know the procedure of :
 1. high resolution timers from userspace

http://rt.wiki.kernel.org/index.php/Cyclictest

 2. ingo's irq latency timing and tracing

I need to write up some howtos for that.


 I will also try these methods and try to compare the performance numbers.


The cyclictest is fine for comparisons, but the latency trace is only
available with the -rt patch.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] RT: Cache cpus_allowed weight for optimizing migration

2007-10-25 Thread Steven Rostedt

--
On Thu, 25 Oct 2007, Gregory Haskins wrote:

  
   - p-cpus_allowed = new_mask;
   + if (p-sched_class-set_cpus_allowed)
 
  Not sure we need this optimization (not doing the nr_cpus_allowed
  calculation). Since, due to priority boosting, we will need to calculate
  then. Calculating it here is better.

 I think you are misunderstanding the code here.  The only optimization
 is that I didn't want to force every sched_class to define a default
 set_cpus_allowed member-fn.  So instead, it first checks if its defined
 and invokes it if true.  Else, the default behavior is to assign the
 mask and calculate the weight.

 If you look at the one and only implementation of this function
 (sched_rt.c:set_cpus_allowed_rt()), it also performs the assignment and
 calcs the mask.

 Or did I misunderstand your objection?

Actually, I say we do the calculation for all tasks regardless of class.
But you can have a function for each class (or NULL) that will do
something with the old and new values.

Reason why is that we don't want the calculation to happen during the
boosting of a tasks (when it goes from one class to another).


 
   + p-sched_class-set_cpus_allowed(p,
  new_mask);  +  else {
   + p-cpus_allowed= new_mask;
   + p-nr_cpus_allowed = cpus_weight(new_mask);
   + }
   +
 /* Can the task run on the task's current CPU? If so, we're done */
 if (cpu_isset(task_cpu(p), new_mask))
 goto out;


/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task,
   struct rq *this_rq)
   @@ -295,6 +334,28 @@ static struct rq *find_lock_lowest_rq(struct 
   task_struct *task,
 int cpu;
 int tries;
  
   + /*
   +  * We can optimize if the hamming weight of the cpus_allowed 
   mask
   +  * is 1 because the task has nowhere to go but one CPU.  So 
   don't
   +  * waste the time trying to find the lowest RQ in this case.
   +  */
 
  This code should be in the pick_next_highest_rt and not here.

 I disagree, but I admit it is not apparent at this level of the series
 why.  The reason has to do with optimizing the wakeup path.  Unless I am
 missing something, I think this placement is optimal, and that will
 hopefully become apparent when you see the rest of my series.

Then move the code there then. One can't be analyzing patches when code
isn't apparent that determines changes.  Those changes need to be
together.  Especially since they may not be applied.

I would be happy to apply most of this patch but because of changes that
are here to help out to-be-announced patches, will keep this patch from
going into the tree.

IOW, break the patch up to the fixes that this patch is to do on its own.
It has plenty. Leave out the stuff that will help out later patches.
Add the stuff that is being recommended, and you can make a separate patch
to move things around in the patch series that does all the TBA stuff.


 
   + if (task-nr_cpus_allowed == 1) {
   + /* If the task is already on the RQ, we are done */
   + if (cpu_isset(this_rq-cpu, task-cpus_allowed))
   + return NULL;
   +
   + cpu = first_cpu(task-cpus_allowed);
   +
   + lowest_rq = cpu_rq(cpu);
   + BUG_ON(this_rq == lowest_rq);
   +
   + /* Otherwise, we can simply grab the new RQ */
   + if (lock_migration_target(task, lowest_rq))
   + return lowest_rq;
   + else
   + return NULL;
   + }
   +
 cpus_and(cpu_mask, cpu_online_map, task-cpus_allowed);
  
 for (tries = 0; tries  RT_MAX_TRIES; tries++) {
   @@ -324,22 +385,8 @@ static struct rq *find_lock_lowest_rq(struct 
   task_struct *task,
 break;
  
 /* if the prio of this runqueue changed, try again */
   - if (double_lock_balance(this_rq, lowest_rq)) {
   - /*
   -  * We had to unlock the run queue. In
   -  * the mean time, task could have
   -  * migrated already or had its affinity changed.
   -  * Also make sure that it wasn't scheduled on its rq.
   -  */
   - if (unlikely(task_rq(task) != this_rq ||
   -  !cpu_isset(lowest_rq-cpu, 
   task-cpus_allowed) ||
   -  task_running(this_rq, task) ||
   -  !task-se.on_rq)) {
   - spin_unlock(lowest_rq-lock);
   - lowest_rq = NULL;
   - break;
   - }
   - }
   + if (!lock_migration_target(task, lowest_rq))
   + return NULL;
 
  I don't like this encapsulating of the doubl_lock_balance. There's a
  reason I 

Re: [PATCH 2/3] RT: Cache cpus_allowed weight for optimizing migration

2007-10-25 Thread Steven Rostedt

--
 @@ -5371,7 +5372,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t 
 new_mask)
   goto out;
   }

 - p-cpus_allowed = new_mask;
 + if (p-sched_class-set_cpus_allowed)
 + p-sched_class-set_cpus_allowed(p, new_mask);
 + else {
 + p-cpus_allowed= new_mask;
 + p-nr_cpus_allowed = cpus_weight(new_mask);
 + }
 +

OK, forget everything I said on this topic. I must have replied while low
on caffeine.

I see here that you do the set_cpus_allowed code if defined _ELSE_ you
just update the weight.

So you do do what I asked.

/me slaps himself to wakeup.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] RT: Cache cpus_allowed weight for optimizing migration

2007-10-25 Thread Steven Rostedt

--
On Thu, 25 Oct 2007, Gregory Haskins wrote:
  /* Will lock the rq it finds */
  static struct rq *find_lock_lowest_rq(struct task_struct *task,
 struct rq *this_rq)
 @@ -295,6 +334,28 @@ static struct rq *find_lock_lowest_rq(struct 
 task_struct *task,
   int cpu;
   int tries;

 + /*
 +  * We can optimize if the hamming weight of the cpus_allowed 
 mask
 +  * is 1 because the task has nowhere to go but one CPU.  So 
 don't
 +  * waste the time trying to find the lowest RQ in this case.
 +  */
   
This code should be in the pick_next_highest_rt and not here.
  
   I disagree, but I admit it is not apparent at this level of the series
   why.  The reason has to do with optimizing the wakeup path.  Unless I am
   missing something, I think this placement is optimal, and that will
   hopefully become apparent when you see the rest of my series.
 
  Then move the code there then. One can't be analyzing patches when code
  isn't apparent that determines changes.  Those changes need to be
  together.  Especially since they may not be applied.

 The correctness of this particular change (IMO) is not predicated on the
 later patches, otherwise I would have done just that.  It accomplishes
 what I intended as is here in this patch.

 Why do you think moving the logic to pick_next_highest is a better
 design?  To be honest, I haven't really studied your new logic in
 push_rt_tasks to understand why you might feel this way.  If you can
 make the case that it is better in the other location then I agree with
 you that we should move it there in this patch, and potentially adjust
 it later.  Until then, I see no problem with it being here.

Ah, after reading the comment in your code, I might know where our
miscommunication is from.  When you hit a task that can't migrate, you
simply stop and don't bother looking for a lowest rq to place it on.

I'm saying to do one better. Put the code in the pick_next_highest_task_rt
and _skip_ rt tasks with nr_cpus_allowed == 1. So we can then look to
migrate another RT task that is lower in priority than a bounded RT task.

Does this clear up what I'm trying to say?

BTW, stop looking for a lowest_rq isn't really an optimization here. Since
we only look at cpus that are in the tasks cpu affinity mask and we skip
the cpu that it currently is on. So we don't even take a lock in that
case.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23-rt2

2007-10-24 Thread Steven Rostedt
We are pleased to announce the 2.6.23-rt2 tree, which can be
downloaded from the location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.23-rt1

  - Improved RT balancing (Steven Rostedt
 with thanks to Gregory Haskins for some ideas)

  - plist debug fix (Arnaldo Carvalho de Melo)

  - seqlock irq fix (Daniel Walker)

  - slab NUMA freeing (Andi Kleen)


to build a 2.6.23-rt2 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.tar.bz2
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23-rt2.bz2

The broken out patches are also available.

-- Steve







-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23-rt3

2007-10-24 Thread Steven Rostedt
We are pleased to announce the 2.6.23-rt3 tree, which can be
downloaded from the location:

 http://www.kernel.org/pub/linux/kernel/projects/rt/

Changes since 2.6.23-rt2

  - Workqueue PI (Peter Zijlstra with help from Daniel Walker)

  - some schedstat updates (Steven Rostedt)

to build a 2.6.23-rt3 tree, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.23.tar.bz2
  http://www.kernel.org/pub/linux/kernel/projects/rt/patch-2.6.23-rt3.bz2

The broken out patches are also available.

-- Steve




-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: High 50us+ latencies in the process signal handling path

2007-10-24 Thread Steven Rostedt

--
On Fri, 19 Oct 2007, Sripathi Kodi wrote:
 
  Can this explain it?

 We have changed the sender to be of priority 1 less than the receiver. This
 brings down the frequency of seeing high latencies, especially if there is a
 background load, but does not eliminate them. In fact the logs Ankita posted
 were with the sender at this RT priority.


The latest -rt code has new RT balancing. Could you see if 2.6.23-rt3
fixes the issues for you.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: thread load balancing on dual CPU Multicore AMD64 system

2007-10-24 Thread Steven Rostedt

--

On Thu, 18 Oct 2007, Gernot Hillier wrote:

 Hi!

 We're currently evaluating whether PREEMPT_RT will work for a certain
 use case combining realtime and performance requirements running on a
 lot of CPUs and using a bunch of RAM.

 For first tests, we're running a small AMD64 test system with 2x2
 cores (2 CPUs with 2 cores each) with 8 GB of RAM.

 We wrote a small testcase which basically has one SCHED_FIFO realtime
 thread which does nothing but sleeping and checking if it wakes up at
 the right time. In addition, it spawns 20 low-prio load threads
 introducing a lot of malloc/memory access/free load on some GB of RAM.

 We can see, that the realtime requirements are fulfilled quite well (if
 using the current glibc with private futexes, but that's another story).
 The rt thread reacts within the expected timeframe with 2.6.22.1-rt9
 as well as with 2.6.23-rt1.

 However, what causes problems is the load balancing of the 20 threads
 over the available CPU cores:


The latest 2.6.23-rt3 (as well as -rt2) has new RT balancing code. Could
you try that to see if it solves your issues.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Interrupt Latency module for intel

2007-10-24 Thread Steven Rostedt

--
On Sat, 20 Oct 2007, Jaswinder Singh wrote:

 hello all,

 On my Intel Pentium 4 with linux-2.6.23-rc8-rt1, I am getting :

 Interrupt Latency result
 
 MAX latency : 12.570 micro-seconds
 MIN latency : 3.352 micro-seconds
 AVG latency : 4.190 micro-seconds
 Total Samples : 32583

 Please review this interrupt latency module and give your feedback.

Wow 12.570 us latencies!

Heh, thanks for showing that dynamic ticks work.

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -v2 4/7] RT overloaded runqueues accounting

2007-10-23 Thread Steven Rostedt

--
On Mon, 22 Oct 2007, Paul Menage wrote:

 On 10/22/07, Paul Jackson [EMAIL PROTECTED] wrote:
  Steven wrote:
   +void cpuset_rt_set_overload(struct task_struct *tsk, int cpu)
   +{
   + cpu_set(cpu, task_cs(tsk)-rt_overload);
   +}
 
  Question for Steven:
 
What locks are held when cpuset_rt_set_overload() is called?

Right now only the rq lock. I know it's not enough. This needs to be
fixed. These are called in the heart of the scheduler so...

 
  Questions for Paul Menage:
 
Does 'tsk' need to be locked for the above task_cs() call?

 Cgroups doesn't change the locking rules for accessing a cpuset from a
 task - you have to have one of:

 - task_lock(task)

I'm not sure of the lock nesting between task_lock(task) and rq-lock.
If this nesting doesn't yet exist, then I can add code to do the
task_lock.  But if the rq-lock is called within the task_lock somewhere,
then this can't be used.


 - callback_mutex

Can schedule, so it's definitely out.


 - be in an RCU section from the point when you call task_cs to the
 point when you stop using its result. (Additionally, in this case
 there's no guarantee that the task stays in this cpuset for the
 duration of the RCU section).

This may also be an option. Although with interrupts disabled for the
entire time the cpusets are used should keep RCU grace periods from moving
forward.

But let me give you two some background to what I'm trying to solve. And
then I'll get to where cpusets come in.

Currently the mainline vanilla kernel does not handle migrating RT tasks
well. And this problem also exists (but not as badly) in the -rt patch.
When a RT task is queued on a CPU that is running an even higher priority
RT task, it should be pushed off to another CPU that is running a lower
priority task. But this does not always happen and a RT task may take
several milliseconds before it gets a chance to run.

This latency is not acceptable for RT tasks. So I added logic to push and
pull RT tasks to and from CPUS.  The push happens when a RT task wakes up
and can't preempt the RT task running on the same CPU, or when a lower RT
task is preempted by a higher one. The lower may be pushed to another CPU.

This alone is not enough to cover RT migration. We also need to pull RT
tasks to a CPU if that CPU is lowering its priority (a high priority RT
task has just went to sleep).

Ideally, and when CONFIG_CPUSETS is not defined, I keep track of all CPUS
that have more than one RT task queued to run on it.  This I call an RT
overload.  There's an RT overload bitmask that keeps track of the CPUS
that have more than one RT task queued.  When a CPU stops running a high
priority RT task, a search is made of all the CPUS that are in the RT
overload state to see if there exists a RT task that can migrate to the
CPU that is lowering its priority.

Ingo Molnar and Peter Zijlstra pointed out to me that this global cpumask
would kill performance on 64 CPU boxes due to cacheline bouncing. To
solve this issue, I placed the RT overload mask into the cpusets.

Ideally, the RT overload mask would keep track of all CPUS that have tasks
that can run on a given CPU. In-other-words, a cpuset from the point of
view of the CPU (or runqueue).  But this is overkill for the RT migration
code.  Large # CPU boxes probably don't have the RT balancing problems
that small # CPU boxes have, since the CPU resource is greater to run RT
tasks on.

Using cpusets seemed to be a nice place to add functionality to keep the
RT overload code from crippling large # CPU boxes.

The RT overload mask is bound to the CPU (runqueue) and not to the task.
But to get to the cpuset, I needed to go through the task. The task I used
was whatever was currently running on the given CPU, or sometimes the task
that was being pushed to a CPU.

Due to overlapping cpusets, there can be inconsistencies between the RT
overload mask and actual CPUS that are in the overload state. This is
tolerated, as the more CPUS you have, the less of a problem it should be
to have overloaded CPUS. Remember, the push task doesn't use the overload.
Only the pull does, and that happens when a push didn't succeed. With more
CPUS, pushes are more likely to succeed.

So the switch to use cpusets was to keep the RT balancing code from
hurting large SMP boxes than for actually being correct on those boxes.
The RT balance is much more important when the CPU resource is limited.
The cpusets were picked just because it seemed reasonable that most of the
time a cpuset of one task on a runqueue would equal that of another task
on the same runqueue. But the code is good enough if that's not the
case.

My code can handle inconsistencies between the RT overload mask and actual
overloaded CPUS. So what I need to really protect with regards to cpusets
is from them disappearing and causing an oops.  Whether or not a task
comes and goes from a cpuset is not the important part.  The RT balance
code only uses the cpuset to determine what other RT tasks are 

Re: [patch 6/8] pull RT tasks

2007-10-22 Thread Steven Rostedt

Hi Dmitry,

On Sun, 2007-10-21 at 11:35 +0200, Dmitry Adamushko wrote:
 Hi Steven,
 
  When a pull RT is initiated, all overloaded runqueues are examined for
  a RT task that is higher in prio than the highest prio task queued on the
  target runqueue. If another runqueue holds a RT task that is of higher
  prio than the highest prio task on the target runqueue is found it is pulled
  to the target runqueue.
 
 I think, 2 things should be accomplished here :
 
 (1) be sure to pull the _highest_ prio task ;
 
 i.e. the _highest_ prio task amongst all runnable (but not running) RT
 tasks across all the run-queues which is capable of running on
 this_cpu ;
 
 (2) don't pull more than 1 task at once.

I've thought about this, and played a little. The problem we have is
that we don't take locks when searching each run queue. So by the time
you got the highest rq to pull from, it may no longer be the highest.
So what do we do in that case? search again?

If we are lucky and pull the highest first, then we will not pull
another one. Since we are not comparing the rq prio to current, but to
the highest task on the current rq.  So once we pull a high prio task,
we only pull another one if we find a even higher prio task.

Yes, it's a little inefficient, and can cause shuffling of task around.
But these tasks haven't actually run yet, so it isn't too much harm. But
the benefit is that when we finish, we have the highest task that can
run on the runqueue.

 
 that said, just pull the highest prio task and run it.
 
 ---
 
 why (2)? Just to avoid situations when tasks are being pulled/pushed
 back and forth between run-queues.
 
 Let's say we have 4 cpu system:
 
 0:  task(10) , task(92)
 1:  task(10), task(91)
 2:  task(10), task(90)
 3:  task(10)
 
 when task(10) on cpu#3 is inactive, we pull task(92), task(91),
 task(90) and then run task(90)... in the mean time, some of cpu[0..2]
 becomes inactive and pull task(91) and task(92) back and run them...
 that may repeat again and again depending on when/how long task(10)
 run on their corresponding cpus...

I'm not sure that's too much of an issue, since we are just switch tasks
on lists. As long has they haven't run yet, then it shouldn't be too
much harm. We do have a little bit of cache hits in the queues
themselves, but alternatives to fix this would probably cause the same
effects.

 
 so it seems to me that the more optimal behavior would be don't pull
 more than you can run at the moment -- that's 1.
 
 to this goal, something like find_lock_highest_rq() would be necessary.

The problem is that we have too many races to find the highest rq at the
moment. But by pulling the highest at the time, we should end up with
what we want.

 
 and I guess, {get,put}_task_struct() should be used in pull_rt_task()
 for the 'next' in a similar way as it's done in push_rt_task() .

No, we don't need to do the get task on pull. With push, we start with a
task and we want to push it somewhere. On the double_lock_balance, we
might lose our rq lock. Which means that next could have been put on
another run queue, run to completion, and then exited destroying the
task. We still reference that task after the double_lock_balance, and if
it is not active anymore, we finish the push.

With the pull, we are focusing on the run queue. If we had to release
the rq lock on double_lock_balance, we just pick the highest next on
the current run queue (which could have changed) and take the current
task on the src rq, which also could have changed. If the current task
on the src rq is higher in priority we move it over to the dst rq. So no
get/put_task_struct is needed.

-- Steve


-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 6/8] pull RT tasks

2007-10-22 Thread Steven Rostedt

--
On Tue, 23 Oct 2007, Dmitry Adamushko wrote:

 Hi Steven,

 agreed with your comments in the previous message. Indeed, I missed some 
 points.

  On wakeup, we can wake up several RT tasks (as my test case does) and if
  we only push one task, then the other tasks may not migrate over to the
  other run queues. I logged this happening in my tests.

 I guess, what may happen is that while we are running push_rt_tasks()
 on CPU-k (say, as a result on try_to_wake_up(task_1))
 and as this_rq-lock may get released (in double_lock_balance()) , we
 may get in a 'race'
 with try_to_wake_up(task_2) from (another) CPU-m.
 It places a woken up task on the same run-queue (for which
 push_rt_task() is already running on CPU-k) and, actually, run
 push_rt_task() for the same rq again!

 So it may happen that both task_1 and task_2 will be pushed from the same 
 CPU...

 Do you see an error in my description? (it's a late hour so I can miss
 something again ... sure, otherwise I'm almost perfect :-/ ) Can it
 correlate with what you have observed in your tests?

 Otherwise, there is 1:1 relation : push_rt_task() is called for every
 new (single) task activated by try_to_wake_up() and for a preempted
 task... so it's not like a few tasks are placed on the run-queue and
 then push_rt_tasks() is called once.

 btw., if this scenario may take place... maybe it would make sense to
 have something like RTLB_INPROGRESS/PENDING and to avoid competing
 push_rt_tasks() calls for the same 'rq' from different CPUs?
 (although, there can be some disadvantages here as well. e.g. we would
 likely need to remove 'max 3 tasks at once' limit and get,
 theoretically, unbounded time spent in push_rt_tasks() on a single
 CPU).

I think I see what you're saying now. That we really should do the push on
wakeup, since only the one rt task that is woken up will be able to be
pushed.

My code may seem a bit aggressive, but from logging, I see that we seldom
push more than one task. But if we don't go ahead and push more than one,
the rt-migrate-test will sometimes fail.  There's funny cases when we need
to push more than one. I can't remember exactly what they were, but
sometimes because of races between the pushes and pulls where one will
miss the fact that an RT process is being queued while its searching to
pull a task, but the push will catch it. Or vice versa.

The max tries is just a paranoid case where I don't want heavy scheduling
to cause an unbounded trying to push or pull tasks. According to the
logging while running the rt-migrate-tests, the loop there repeated
approximately 1 in 20, and iterated twice 1 in 100 and hit the third loop
twice in all my tests (over a 1000 iterations being logged). But logging
slows it down and modifies the results itself, and perhaps I'll add
statistics soon.

I'm about to post a second batch of patches.

Thanks,

-- Steve

-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -v2 5/7] pull RT tasks

2007-10-22 Thread Steven Rostedt
This patch adds the algorithm to pull tasks from RT overloaded runqueues.

When a pull RT is initiated, all overloaded runqueues are examined for
a RT task that is higher in prio than the highest prio task queued on the
target runqueue. If another runqueue holds a RT task that is of higher
prio than the highest prio task on the target runqueue is found it is pulled
to the target runqueue.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

---
 kernel/sched.c|2 
 kernel/sched_rt.c |  196 ++
 2 files changed, 187 insertions(+), 11 deletions(-)

Index: linux-test.git/kernel/sched.c
===
--- linux-test.git.orig/kernel/sched.c  2007-10-22 22:31:59.0 -0400
+++ linux-test.git/kernel/sched.c   2007-10-22 22:37:28.0 -0400
@@ -3622,6 +3622,8 @@ need_resched_nonpreemptible:
switch_count = prev-nvcsw;
}
 
+   schedule_balance_rt(rq, prev);
+
if (unlikely(!rq-nr_running))
idle_balance(cpu, rq);
 
Index: linux-test.git/kernel/sched_rt.c
===
--- linux-test.git.orig/kernel/sched_rt.c   2007-10-22 22:32:18.0 
-0400
+++ linux-test.git/kernel/sched_rt.c2007-10-22 22:37:28.0 -0400
@@ -168,8 +168,17 @@ static void put_prev_task_rt(struct rq *
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+   if (!task_running(rq, p) 
+   (cpu  0 || cpu_isset(cpu, p-cpus_allowed)))
+   return 1;
+   return 0;
+}
+
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
+int cpu)
 {
struct rt_prio_array *array = rq-rt.active;
struct task_struct *next;
@@ -188,26 +197,36 @@ static struct task_struct *pick_next_hig
}
 
queue = array-queue + idx;
+   BUG_ON(list_empty(queue));
+
next = list_entry(queue-next, struct task_struct, run_list);
-   if (unlikely(next != rq-curr))
-   return next;
+   if (unlikely(pick_rt_task(rq, next, cpu)))
+   goto out;
 
if (queue-next-next != queue) {
/* same prio task */
next = list_entry(queue-next-next, struct task_struct, 
run_list);
-   return next;
+   if (pick_rt_task(rq, next, cpu))
+   goto out;
}
 
+ retry:
/* slower, but more flexible */
idx = find_next_bit(array-bitmap, MAX_RT_PRIO, idx+1);
-   if (unlikely(idx = MAX_RT_PRIO)) {
-   WARN_ON(1); /* rt_nr_running was 2 and above! */
+   if (unlikely(idx = MAX_RT_PRIO))
return NULL;
-   }
 
queue = array-queue + idx;
-   next = list_entry(queue-next, struct task_struct, run_list);
+   BUG_ON(list_empty(queue));
 
+   list_for_each_entry(next, queue, run_list) {
+   if (pick_rt_task(rq, next, cpu))
+   goto out;
+   }
+
+   goto retry;
+
+ out:
return next;
 }
 
@@ -296,13 +315,15 @@ static int push_rt_task(struct rq *this_
 
assert_spin_locked(this_rq-lock);
 
-   next_task = pick_next_highest_task_rt(this_rq);
+   next_task = pick_next_highest_task_rt(this_rq, -1);
if (!next_task)
return 0;
 
  retry:
-   if (unlikely(next_task == this_rq-curr))
+   if (unlikely(next_task == this_rq-curr)) {
+   WARN_ON(1);
return 0;
+   }
 
/*
 * It's possible that the next_task slipped in of
@@ -326,7 +347,7 @@ static int push_rt_task(struct rq *this_
 * so it is possible that next_task has changed.
 * If it has, then try again.
 */
-   task = pick_next_highest_task_rt(this_rq);
+   task = pick_next_highest_task_rt(this_rq, -1);
if (unlikely(task != next_task)  task  paranoid--) {
put_task_struct(next_task);
next_task = task;
@@ -371,6 +392,158 @@ static void push_rt_tasks(struct rq *rq)
;
 }
 
+static int pull_rt_task(struct rq *this_rq)
+{
+   struct task_struct *next;
+   struct task_struct *p;
+   struct rq *src_rq;
+   cpumask_t rto_cpumask;
+   int this_cpu = this_rq-cpu;
+   int cpu;
+   int ret = 0;
+
+   assert_spin_locked(this_rq-lock);
+
+   /*
+* If cpusets are used, and we have overlapping
+* run queue cpusets, then this algorithm may not catch all.
+* This is just the price you pay on trying

[PATCH -v2 0/7] New RT Task Balancing -v2

2007-10-22 Thread Steven Rostedt
[
  Changes since V1:
Updated to git tree 55b70a0300b873c0ec7ea6e33752af56f41250ce

Various clean ups suggested by Gregory Haskins, Dmitry Adamushko,
and Peter Zijlstra.

Biggest change was recommended by Ingo Molnar. This is the use of cpusets
for keeping track of RT overloaded CPUS.  When CONFIG_CPUSETS is not
defined, we have a single global rt_overload_mask that keeps track
of the runqueues with more than one RT task queued. When CONFIG_CPUSETS
is configured, that bitmask is stored in the cpusets.

Note that in the case of overlapping cpusets it is possible to have
inconsistent data between the bitmask and actual RT overloaded runqueues.
The worst that can happen is that a task doesn't get moved quickly
over to a runnable CPU.  But this is a price we pay to keep from
dirtying caches for large number of CPU boxes. If this does happen
it gets cleaned up rather quickly since there are checks for
RT overload bits being set when they shouldn't be.

For most systems this is not an issue since a single cpuset is used.
]

Currently in mainline the balancing of multiple RT threads is quite broken.
That is to say that a high priority thread that is scheduled on a CPU
with a higher priority thread, may need to unnecessarily wait while it
can easily run on another CPU that's running a lower priority thread.

Balancing (or migrating) tasks in general is an art. Lots of considerations
must be taken into account. Cache lines, NUMA and more. This is true
with general processes which expect high throughput and migration can
be done in batch.  But when it comes to RT tasks, we really need to
put them off to a CPU that they can run on as soon as possible. Even
if it means a bit of cache line flushing.

Right now an RT task can wait several milliseconds before it gets scheduled
to run. And perhaps even longer. The migration thread is not fast enough
to take care of RT tasks.

To demonstrate this, I wrote a simple test.
 
  http://rostedt.homelinux.com/rt/rt-migrate-test.c

  (gcc -o rt-migrate-test rt-migrate-test.c -lpthread)

This test expects a parameter to pass in the number of threads to create.
If you add the '-c' option (check) it will terminate if the test fails
one of the iterations. If you add this, pass in +1 threads.

For example, on a 4 way box, I used

  rt-migrate-test -c 5

What this test does is to create the number of threads specified (in this
case 5). Each thread is set as an RT FIFO task starting at a specified
prio (default 2) and each thread being one priority higher. So with this
example the 5 threads created are at priorities 2, 3, 4, 5, and 6.

The parent thread sets its priority to one higher than the highest of
the children (this example 7). It uses pthread_barrier_wait to synchronize
the threads.  Then it takes a time stamp and starts all the threads.
The threads when woken up take a time stamp and compares it to the parent
thread to see how long it took to be awoken. It then runs for an
interval (20ms default) in a busy loop. The busy loop ends when it reaches
the interval delta from the start time stamp. So if it is preempted, it
may not actually run for the full interval. This is expected behavior
of the test.

The numbers recorded are the delta from the thread's time stamp from the
parent time stamp. The number of iterations it ran the busy loop for, and
the delta from a thread time stamp taken at the end of the loop to the
parent time stamp.

Sometimes a lower priority task might wake up before a higher priority,
but this is OK, as long as the higher priority process gets the CPU when
it is awoken.

At the end of the test, the iteration data is printed to stdout. If a
higher priority task had to wait for a lower one to finish running, then
this is considered a failure. Here's an example of the output from
a run against git commit 4fa4d23fa20de67df919030c1216295664866ad7.

   1:   36  33   20041  39  33
 len:20036   20033   40041   20039   20033
 loops: 167789  167693  227167  167829  167814

On iteration 1 (starts at 0) the third task started at 20ms after the parent
woke it up. We can see here that the first two tasks ran to completion
before the higher priority task was even able to start. That is a
20ms latency for the higher priority task!!!

So people who think that their audio would lose most latencies by upping 
the priority, may be in for a surprise. Since some kernel threads (like
the migration thread itself) may cause this latency.

To solve this issue, I've changed the RT task balancing from a passive
method (migration thread) to an active method.  This new method is
to actively push or pull RT tasks when they are woken up or scheduled.

On wake up of a task if it is an RT task, and there's already an RT task
of higher priority running on its runqueue, we initiate a push_rt_tasks
algorithm. This algorithm looks at the highest non-running RT task
and tries to find a CPU where it can run on. It 

[PATCH -v2 6/7] wake up balance RT

2007-10-22 Thread Steven Rostedt
This patch adds pushing of overloaded RT tasks from a runqueue that is
having tasks (most likely RT tasks) added to the run queue.

TODO: We don't cover the case of waking of new RT tasks (yet).

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]
---
 kernel/sched.c|2 ++
 kernel/sched_rt.c |   12 
 2 files changed, 14 insertions(+)

Index: linux-test.git/kernel/sched_rt.c
===
--- linux-test.git.orig/kernel/sched_rt.c   2007-10-22 22:37:28.0 
-0400
+++ linux-test.git/kernel/sched_rt.c2007-10-22 22:38:04.0 -0400
@@ -559,9 +559,21 @@ static void schedule_tail_balance_rt(str
spin_unlock_irq(&rq->lock);
}
 }
+
+static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+{
+   if (unlikely(rt_task(p)) &&
+   (p->prio >= rq->curr->prio)) {
+   /* pull_rt_task needs task to be running */
+   p->state = TASK_RUNNING;
+   push_rt_tasks(rq);
+   }
+}
+
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)  do { } while (0)
 # define schedule_balance_rt(rq)   do { } while (0)
+# define wakeup_balance_rt(rq, p)  do { } while (0)
 #endif /* CONFIG_SMP */
 
 
Index: linux-test.git/kernel/sched.c
===
--- linux-test.git.orig/kernel/sched.c  2007-10-22 22:37:28.0 -0400
+++ linux-test.git/kernel/sched.c   2007-10-22 22:38:04.0 -0400
@@ -22,6 +22,7 @@
  *  by Peter Williams
  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-10-22  RT overload balancing by Steven Rostedt
  */
 
 #include <linux/mm.h>
@@ -1614,6 +1615,7 @@ out_activate:
update_rq_clock(rq);
activate_task(rq, p, 1);
check_preempt_curr(rq, p);
+   wakeup_balance_rt(rq, p);
success = 1;
 
 out_running:

-- 
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -v2 1/7] Add rt_nr_running accounting

2007-10-22 Thread Steven Rostedt
This patch adds accounting to keep track of the number of RT tasks running
on a runqueue. This information will be used in later patches.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

---
 kernel/sched.c|1 +
 kernel/sched_rt.c |   17 +
 2 files changed, 18 insertions(+)

Index: linux-test.git/kernel/sched.c
===
--- linux-test.git.orig/kernel/sched.c  2007-10-22 22:18:53.0 -0400
+++ linux-test.git/kernel/sched.c   2007-10-22 22:31:51.0 -0400
@@ -266,6 +266,7 @@ struct rt_rq {
struct rt_prio_array active;
int rt_load_balance_idx;
struct list_head *rt_load_balance_head, *rt_load_balance_curr;
+   unsigned long rt_nr_running;
 };
 
 /*
Index: linux-test.git/kernel/sched_rt.c
===
--- linux-test.git.orig/kernel/sched_rt.c   2007-10-22 22:18:53.0 
-0400
+++ linux-test.git/kernel/sched_rt.c2007-10-22 22:31:51.0 -0400
@@ -25,12 +25,27 @@ static void update_curr_rt(struct rq *rq
curr->se.exec_start = rq->clock;
 }
 
+static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
+{
+   WARN_ON(!rt_task(p));
+   rq->rt.rt_nr_running++;
+}
+
+static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
+{
+   WARN_ON(!rt_task(p));
+   WARN_ON(!rq->rt.rt_nr_running);
+   rq->rt.rt_nr_running--;
+}
+
 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 {
struct rt_prio_array *array = &rq->rt.active;

list_add_tail(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
+
+   inc_rt_tasks(p, rq);
 }
 
 /*
@@ -45,6 +60,8 @@ static void dequeue_task_rt(struct rq *r
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
+
+   dec_rt_tasks(p, rq);
 }
 
 /*

-- 
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -v2 2/7] track highest prio queued on runqueue

2007-10-22 Thread Steven Rostedt
This patch adds accounting to each runqueue to keep track of the
highest prio task queued on the run queue. We only care about
RT tasks, so if the run queue does not contain any active RT tasks
its priority will be considered MAX_RT_PRIO.

This information will be used for later patches.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]

---
 kernel/sched.c|3 +++
 kernel/sched_rt.c |   18 ++
 2 files changed, 21 insertions(+)

Index: linux-test.git/kernel/sched.c
===
--- linux-test.git.orig/kernel/sched.c  2007-10-22 22:31:51.0 -0400
+++ linux-test.git/kernel/sched.c   2007-10-22 22:31:55.0 -0400
@@ -267,6 +267,8 @@ struct rt_rq {
int rt_load_balance_idx;
struct list_head *rt_load_balance_head, *rt_load_balance_curr;
unsigned long rt_nr_running;
+   /* highest queued rt task prio */
+   int highest_prio;
 };
 
 /*
@@ -6725,6 +6727,7 @@ void __init sched_init(void)
rq-cpu = i;
rq-migration_thread = NULL;
INIT_LIST_HEAD(rq-migration_queue);
+   rq->rt.highest_prio = MAX_RT_PRIO;
 #endif
atomic_set(rq-nr_iowait, 0);
 
Index: linux-test.git/kernel/sched_rt.c
===
--- linux-test.git.orig/kernel/sched_rt.c   2007-10-22 22:31:51.0 
-0400
+++ linux-test.git/kernel/sched_rt.c2007-10-22 22:31:55.0 -0400
@@ -29,6 +29,10 @@ static inline void inc_rt_tasks(struct t
 {
WARN_ON(!rt_task(p));
rq-rt.rt_nr_running++;
+#ifdef CONFIG_SMP
+   if (p->prio < rq->rt.highest_prio)
+   rq->rt.highest_prio = p->prio;
+#endif /* CONFIG_SMP */
 }
 
 static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -36,6 +40,20 @@ static inline void dec_rt_tasks(struct t
WARN_ON(!rt_task(p));
WARN_ON(!rq-rt.rt_nr_running);
rq-rt.rt_nr_running--;
+#ifdef CONFIG_SMP
+   if (rq->rt.rt_nr_running) {
+   struct rt_prio_array *array;
+
+   WARN_ON(p->prio < rq->rt.highest_prio);
+   if (p->prio == rq->rt.highest_prio) {
+   /* recalculate */
+   array = &rq->rt.active;
+   rq->rt.highest_prio =
+   sched_find_first_bit(array->bitmap);
+   } /* otherwise leave rq->highest prio alone */
+   } else
+   rq->rt.highest_prio = MAX_RT_PRIO;
+#endif /* CONFIG_SMP */
 }
 
 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)

-- 
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -v2 4/7] RT overloaded runqueues accounting

2007-10-22 Thread Steven Rostedt
This patch adds an RT overload accounting system. When a runqueue has
more than one RT task queued, it is marked as overloaded. That is that it
is a candidate to have RT tasks pulled from it.

If CONFIG_CPUSETS is defined, then an rt overloaded CPU bitmask is created
in the cpusets.  The algorithm for pulling tasks is limited to the cpuset
of the current task on the runqueue. Because of overlapping cpusets, it is
possible that the bitmask may get out of sync with actual overloaded RT
runqueues. But it won't cause any real harm. The worst that can happen is
that an RT task may not migrate to a CPU that it can run on when it could.
But that's an OK price to pay to keep the accounting simple and not kill
the cache on large SMP boxes.

When CONFIG_CPUSETS is not set, then a single RT overload CPU mask is used.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]
---
 include/linux/cpuset.h |6 
 kernel/cpuset.c|   62 +
 kernel/sched_rt.c  |   29 ++
 3 files changed, 97 insertions(+)

Index: linux-test.git/kernel/sched_rt.c
===
--- linux-test.git.orig/kernel/sched_rt.c   2007-10-22 22:31:59.0 
-0400
+++ linux-test.git/kernel/sched_rt.c2007-10-22 22:32:18.0 -0400
@@ -3,6 +3,31 @@
  * policies)
  */
 
+#ifdef CONFIG_CPUSETS
+# define rt_overload(p) cpuset_rt_overload(p)
+# define rt_set_overload(p, cpu) cpuset_rt_set_overload(p, cpu)
+# define rt_clear_overload(p, cpu) cpuset_rt_clear_overload(p, cpu)
+# define rt_overloaded(p) cpuset_rt_overloaded(p)
+#else
+static cpumask_t rt_overload_mask;
+static inline int rt_overloaded(struct task_struct *p)
+{
+   return !cpus_empty(rt_overload_mask);
+}
+static inline cpumask_t rt_overload(struct task_struct *p)
+{
+   return rt_overload_mask;
+}
+static inline void rt_set_overload(struct task_struct *p, int cpu)
+{
+   cpu_set(cpu, rt_overload_mask);
+}
+static inline void rt_clear_overload(struct task_struct *p, int cpu)
+{
+   cpu_clear(cpu, rt_overload_mask);
+}
+#endif
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -32,6 +57,8 @@ static inline void inc_rt_tasks(struct t
 #ifdef CONFIG_SMP
if (p->prio < rq->rt.highest_prio)
rq->rt.highest_prio = p->prio;
+   if (rq->rt.rt_nr_running > 1)
+   rt_set_overload(p, rq->cpu);
 #endif /* CONFIG_SMP */
 }
 
@@ -53,6 +80,8 @@ static inline void dec_rt_tasks(struct t
} /* otherwise leave rq->highest prio alone */
} else
rq->rt.highest_prio = MAX_RT_PRIO;
+   if (rq->rt.rt_nr_running < 2)
+   rt_clear_overload(p, rq->cpu);
 #endif /* CONFIG_SMP */
 }
 
Index: linux-test.git/include/linux/cpuset.h
===
--- linux-test.git.orig/include/linux/cpuset.h  2007-10-22 22:18:53.0 
-0400
+++ linux-test.git/include/linux/cpuset.h   2007-10-22 22:32:18.0 
-0400
@@ -78,6 +78,12 @@ extern void cpuset_track_online_nodes(vo
 
 extern int current_cpuset_is_being_rebound(void);
 
+/* The cpuset_rt_overload code is only used when CONFIG_CPUSETS is defined */
+extern int cpuset_rt_overloaded(struct task_struct *tsk);
+extern void cpuset_rt_set_overload(struct task_struct *tsk, int cpu);
+extern cpumask_t cpuset_rt_overload(struct task_struct *tsk);
+extern void cpuset_rt_clear_overload(struct task_struct *tsk, int cpu);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
Index: linux-test.git/kernel/cpuset.c
===
--- linux-test.git.orig/kernel/cpuset.c 2007-10-22 22:18:53.0 -0400
+++ linux-test.git/kernel/cpuset.c  2007-10-22 22:36:29.0 -0400
@@ -84,6 +84,9 @@ struct cpuset {
cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed;/* Memory Nodes allowed to tasks */
 
+   /* bits protected by rq locks. */
+   cpumask_t rt_overload;  /* runqueue overload mask */
+
struct cpuset *parent;  /* my parent */
 
/*
@@ -179,6 +182,7 @@ static struct cpuset top_cpuset = {
.flags = ((1  CS_CPU_EXCLUSIVE) | (1  CS_MEM_EXCLUSIVE)),
.cpus_allowed = CPU_MASK_ALL,
.mems_allowed = NODE_MASK_ALL,
+   .rt_overload = CPU_MASK_NONE,
 };
 
 /*
@@ -1566,6 +1570,7 @@ static void cpuset_post_clone(struct cgr
 
cs->mems_allowed = parent_cs->mems_allowed;
cs->cpus_allowed = parent_cs->cpus_allowed;
+   cs->rt_overload = parent_cs->rt_overload;
return;
 }
 
@@ -1604,6 +1609,7 @@ static struct cgroup_subsys_state *cpuse
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cs->cpus_allowed = CPU_MASK_NONE;
cs->mems_allowed = NODE_MASK_NONE;
+   cs->rt_overload = CPU_MASK_NONE;

[PATCH -v2 7/7] disable CFS RT load balancing.

2007-10-22 Thread Steven Rostedt
Since we now take an active approach to load balancing, we don't need to
balance RT tasks via CFS. In fact, this code was found to pull RT tasks
away from CPUS that the active movement performed, resulting in
large latencies.

Signed-off-by: Steven Rostedt [EMAIL PROTECTED]
---
 kernel/sched_rt.c |   91 +-
 1 file changed, 2 insertions(+), 89 deletions(-)

Index: linux-test.git/kernel/sched_rt.c
===
--- linux-test.git.orig/kernel/sched_rt.c   2007-10-22 22:38:04.0 
-0400
+++ linux-test.git/kernel/sched_rt.c2007-10-22 22:38:33.0 -0400
@@ -576,101 +576,14 @@ static void wakeup_balance_rt(struct rq 
 # define wakeup_balance_rt(rq, p)  do { } while (0)
 #endif /* CONFIG_SMP */
 
-
-/*
- * Load-balancing iterator. Note: while the runqueue stays locked
- * during the whole iteration, the current task might be
- * dequeued so the iterator has to be dequeue-safe. Here we
- * achieve that by always pre-iterating before returning
- * the current task:
- */
-static struct task_struct *load_balance_start_rt(void *arg)
-{
-   struct rq *rq = arg;
-   struct rt_prio_array *array = &rq->rt.active;
-   struct list_head *head, *curr;
-   struct task_struct *p;
-   int idx;
-
-   idx = sched_find_first_bit(array->bitmap);
-   if (idx >= MAX_RT_PRIO)
-   return NULL;
-
-   head = array->queue + idx;
-   curr = head->prev;
-
-   p = list_entry(curr, struct task_struct, run_list);
-
-   curr = curr->prev;
-
-   rq->rt.rt_load_balance_idx = idx;
-   rq->rt.rt_load_balance_head = head;
-   rq->rt.rt_load_balance_curr = curr;
-
-   return p;
-}
-
-static struct task_struct *load_balance_next_rt(void *arg)
-{
-   struct rq *rq = arg;
-   struct rt_prio_array *array = &rq->rt.active;
-   struct list_head *head, *curr;
-   struct task_struct *p;
-   int idx;
-
-   idx = rq->rt.rt_load_balance_idx;
-   head = rq->rt.rt_load_balance_head;
-   curr = rq->rt.rt_load_balance_curr;
-
-   /*
-* If we arrived back to the head again then
-* iterate to the next queue (if any):
-*/
-   if (unlikely(head == curr)) {
-   int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-
-   if (next_idx >= MAX_RT_PRIO)
-   return NULL;
-
-   idx = next_idx;
-   head = array->queue + idx;
-   curr = head->prev;
-
-   rq->rt.rt_load_balance_idx = idx;
-   rq->rt.rt_load_balance_head = head;
-   }
-
-   p = list_entry(curr, struct task_struct, run_list);
-
-   curr = curr->prev;
-
-   rq->rt.rt_load_balance_curr = curr;
-
-   return p;
-}
-
 static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
 {
-   int nr_moved;
-   struct rq_iterator rt_rq_iterator;
-   unsigned long load_moved;
-
-   rt_rq_iterator.start = load_balance_start_rt;
-   rt_rq_iterator.next = load_balance_next_rt;
-   /* pass 'busiest' rq argument into
-* load_balance_[start|next]_rt iterators
-*/
-   rt_rq_iterator.arg = busiest;
-
-   nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
-   max_load_move, sd, idle, all_pinned, &load_moved,
-   this_best_prio, rt_rq_iterator);
-
-   return load_moved;
+   /* don't touch RT tasks */
+   return 0;
 }
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)

-- 
-
To unsubscribe from this list: send the line unsubscribe linux-rt-users in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >