RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-07 Thread Chen, Kenneth W
Ingo Molnar wrote on Tuesday, April 05, 2005 11:46 PM
> ok, the delay of 16 secs is a lot better. Could you send me the full
> detection log - how stable is the curve?

Full log attached.


[attachment: boot.log - uuencoded calibration/boot log; the encoded data is garbled in this archive and has been omitted]


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-06 Thread Ingo Molnar

* Chen, Kenneth W <[EMAIL PROTECTED]> wrote:

> > tested on x86, the calibration results look ok there.
> 
> Calibration result on ia64 (1.5 GHz, 9 MB) - somewhat smaller in this 
> version compared to the earlier estimate of 10.4 ms.  The optimal setting 
> found by a db workload is around 16 ms.

with idle time in the system i'd first concentrate on getting rid of the 
idle time, and then re-measuring the sweet spot. 9.3 msec is certainly a 
correct ballpark figure.

There will also be workload related artifacts: you may speculatively 
delay migration to another CPU, in the hope of the currently executing 
task scheduling away soon. (I have played with that in the past - the 
scheduler has some idea about how scheduling-happy a task is, based on 
the interactivity estimator.)

The cost matrix on the other hand measures the pure cache-related 
migration cost, on a quiet system. There can be up to a factor-of-2 
increase in the 'effective migration cost', depending on the average 
length of the scheduling atom of the currently executing task. Further 
increases may happen if the system does not scale and interacting 
migrations slow down each other. So with the 9.3 msec estimate, the true 
migration factor might be between 9.3 and 18.6 msecs. The bad news would 
be if the estimator gave a higher value than your sweet spot.

once we have a good estimate of the migration cost between domains 
(ignoring permanent penalties such as NUMA), there's a whole lot of 
things we can do with that, to apply it in a broader sense.

> ---------------------------------------------------------
> | migration cost matrix (max_cache_size: 9437184, cpu: -1 MHz):
> ---------------------------------------------------------
>           [00]    [01]    [02]    [03]
> [00]:      -     9.3(0)  9.3(0)  9.3(0)
> [01]:    9.3(0)    -     9.3(0)  9.3(0)
> [02]:    9.3(0)  9.3(0)    -     9.3(0)
> [03]:    9.3(0)  9.3(0)  9.3(0)    -
> ---------------------------------------------------------
> | cacheflush times [1]: 9.3 (9329800)
> | calibration delay: 16 seconds
>

ok, the delay of 16 secs is a lot better. Could you send me the full 
detection log - how stable is the curve?

Ingo



RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-05 Thread Chen, Kenneth W
Ingo Molnar wrote on Monday, April 04, 2005 8:05 PM
>
> latest patch attached. Changes:
>
>  - stabilized calibration even more, by using cache flushing
>    instructions to generate a predictable working set. The cache
>    flushing itself is not timed, it is used to create quiescent
>    cache state.
>
>    I only guessed the ia64 version - e.g. i didn't know what 'type'
>    argument to pass to ia64_sal_cache_flush() to get a d/cache
>    flush+invalidate.

It is preferable to use ia64_pal_cache_flush instead of the SAL call, but
it hangs the machine with that PAL call.  I will look at it tomorrow.
The type argument for sal_cache_flush is: 1 for icache, 2 for dcache,
and 3 for i+d.
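
For illustration, a minimal sketch (not from the posted patch) of how the calibration path could do the i+d flush through the SAL wrapper, using the type values above; the constant names and the helper function are assumptions:

#include <asm/sal.h>	/* for ia64_sal_cache_flush() */

/* type values as described above (names are made up for this sketch) */
#define SAL_FLUSH_ICACHE	1UL
#define SAL_FLUSH_DCACHE	2UL
#define SAL_FLUSH_I_AND_D	3UL

/*
 * Flush both caches to reach a quiescent cache state before a timed run.
 * ia64_sal_cache_flush() returns a SAL status code; non-zero means failure.
 */
static void __init flush_caches_for_calibration(void)
{
	long status = ia64_sal_cache_flush(SAL_FLUSH_I_AND_D);

	if (status)
		printk(KERN_WARNING "sal_cache_flush(i+d) failed: %ld\n",
		       status);
}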


>  - due to more stable results, reduced ITERATIONS from 3 to 2 - this
>    should further speed up calibration.
>
> tested on x86, the calibration results look ok there.

Calibration result on ia64 (1.5 GHz, 9 MB) - somewhat smaller in this
version compared to the earlier estimate of 10.4 ms.  The optimal setting
found by a db workload is around 16 ms.

---------------------------------------------------------
| migration cost matrix (max_cache_size: 9437184, cpu: -1 MHz):
---------------------------------------------------------
          [00]    [01]    [02]    [03]
[00]:      -     9.3(0)  9.3(0)  9.3(0)
[01]:    9.3(0)    -     9.3(0)  9.3(0)
[02]:    9.3(0)  9.3(0)    -     9.3(0)
[03]:    9.3(0)  9.3(0)  9.3(0)    -
---------------------------------------------------------
| cacheflush times [1]: 9.3 (9329800)
| calibration delay: 16 seconds





RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-05 Thread Chen, Kenneth W
Ingo Molnar wrote on Sunday, April 03, 2005 11:24 PM

> great! How long does the benchmark take (hours?), and is there any way
> to speed up the benchmarking (without hurting accuracy), so that
> multiple migration-cost settings could be tried? Would it be possible to
> try a few other values via the migration_factor boot option, in 0.5 msec
> steps or so, to find the current sweet spot? It used to be at 11 msec
> previously, correct?

It takes days; each experiment is 5 hours.  Previous experiments on 2.6.8
showed that the sweet spot was 12.5 ms.

This time on 2.6.11, it got pushed out to 16 ms.  Results compared to 10 ms:

 8 ms   -0.3%
10 ms    --
12 ms   +0.11%
16 ms   +0.14%
20 ms   +0.06%

12 ms and up all have about 1.5% idle time.  We are nowhere near the
limits of what the disk storage can deliver, so there is potential to
tune/optimize the scheduler and reclaim this extra idle time.

- Ken






Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

latest patch attached. Changes:

 - stabilized calibration even more, by using cache flushing 
   instructions to generate a predictable working set. The cache 
   flushing itself is not timed, it is used to create quiescent
   cache  state.

   I only guessed the ia64 version - e.g. i didn't know what 'type' 
   argument to pass to ia64_sal_cache_flush() to get a d/cache 
   flush+invalidate. Same for ppc/ppc64 - i only guessed the function
   in question but didn't test it.

 - due to more stable results, reduced ITERATIONS from 3 to 2 - this 
   should further speed up calibration.

tested on x86, the calibration results look ok there.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4640,6 +4641,506 @@ void __devinit init_sched_build_groups(s
 }
 
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer 
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE   2
+#define MIN_CACHE_SIZE (64*1024U)
+#define DEFAULT_CACHE_SIZE (5*1024*1024U)
+#define ITERATIONS 2
+#define SIZE_THRESH	130
+#define COST_THRESH	130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+   { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+   int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+   str = get_options(str, ARRAY_SIZE(ints), ints);
+
+   printk("#ints: %d\n", ints[0]);
+   for (i = 1; i <= ints[0]; i++) {
+   migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+   printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+   }
+   return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+   get_option(&str, &migration_factor);
+   migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+   return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+__init static unsigned long domain_distance(int cpu1, int cpu2)
+{
+   unsigned long distance = 0;
+   struct sched_domain *sd;
+
+   for_each_domain(cpu1, sd) {
+   WARN_ON(!cpu_isset(cpu1, sd->span));
+   if (cpu_isset(cpu2, sd->span))
+   return distance;
+   distance++;
+   }
+   if (distance >= MAX_DOMAIN_DISTANCE) {
+   WARN_ON(1);
+   distance = MAX_DOMAIN_DISTANCE-1;
+   }
+
+   return distance;
+}
+
+static __initdata unsigned int migration_debug = 1;
+
+static int __init setup_migration_debug(char *str)
+{
+   get_option(&str, &migration_debug);
+   return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Chen, Kenneth W <[EMAIL PROTECTED]> wrote:

> Perhaps, I'm not getting the latest patch?  It skipped measuring 
> because migration cost array is non-zero (initialized to -1LL):

yeah ... some mixup here. I've attached the latest.

> Also, the cost calculation in measure_one() looks fishy to me in this 
> version.

> > +   t0 = sched_clock();
> > +   touch_cache(cache, size);
> > +   t1 = sched_clock();

> > +   t2 = sched_clock();
> > +   touch_cache(cache, size);
> > +   t3 = sched_clock();

> > +   cost = t2-t1 + t3-t2;
> 
> Typo here ??

yeah - fixed this too in the attached snapshot.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4640,6 +4641,478 @@ void __devinit init_sched_build_groups(s
 }
 
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer 
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE   2
+#define MIN_CACHE_SIZE (64*1024U)
+#define DEFAULT_CACHE_SIZE (5*1024*1024U)
+#define ITERATIONS 3
+#define SIZE_THRESH	130
+#define COST_THRESH	130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+   { -1LL , };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+   int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+   str = get_options(str, ARRAY_SIZE(ints), ints);
+
+   printk("#ints: %d\n", ints[0]);
+   for (i = 1; i <= ints[0]; i++) {
+   migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+   printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+   }
+   return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+   get_option(&str, &migration_factor);
+   migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+   return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+__init static unsigned long domain_distance(int cpu1, int cpu2)
+{
+   unsigned long distance = 0;
+   struct sched_domain *sd;
+
+   for_each_domain(cpu1, sd) {
+   WARN_ON(!cpu_isset(cpu1, sd->span));
+   if (cpu_isset(cpu2, sd->span))
+   return distance;
+   distance++;
+   }
+   if (distance >= MAX_DOMAIN_DISTANCE) {
+   WARN_ON(1);
+   distance = MAX_DOMAIN_DISTANCE-1;
+   }
+
+   return distance;
+}
+
+static __initdata unsigned int migration_debug = 1;
+
+static int __init setup_migration_debug(char *str)
+{
+   get_option(&str, &migration_debug);
+   return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures 

RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Chen, Kenneth W
* Chen, Kenneth W <[EMAIL PROTECTED]> wrote:
> The cache size information on ia64 is already available at the finger
> tip. Here is a patch that I whipped up to set max_cache_size for ia64.

Ingo Molnar wrote on Monday, April 04, 2005 4:38 AM
> thanks - i've added this to my tree.
>
> i've attached the latest snapshot. There are a number of changes in the
> patch: firstly, i changed the direction of the iteration to go from
> small sizes to larger sizes, and i added a method to detect the maximum.
>
> Furthermore, i tweaked the test some more to make it both faster and
> more reliable, and i cleaned up the code. (e.g. now we migrate via the
> scheduler, not via on_each_cpu().) The default patch should print enough
> debug information as-is.
>
> I changed the workload too so potentially the detected values might be
> off from the ideal value on your box. The values detected on x86 are
> mostly unchanged, relative to previous patches.

Perhaps I'm not getting the latest patch?  It skipped measuring because
the migration cost array is non-zero (initialized to -1LL):

          [00]    [01]    [02]    [03]
[00]:      -     0.0(0)  0.0(0)  0.0(0)
[01]:    0.0(0)    -     0.0(0)  0.0(0)
[02]:    0.0(0)  0.0(0)    -     0.0(0)
[03]:    0.0(0)  0.0(0)  0.0(0)    -

| cacheflush times [1]: 0.0 (-1)
| calibration delay: 0 seconds



Need this change?  I bet you had that in your tree already.

--- ./kernel/sched.c.orig	2005-04-04 18:01:45.0 -0700
+++ ./kernel/sched.c	2005-04-04 18:21:41.0 -0700
@@ -5050,7 +5050,7 @@ void __devinit calibrate_migration_costs
 	/*
 	 * Do we have the result cached already?
 	 */
-	if (migration_cost[distance])
+	if (migration_cost[distance] != -1LL)
 		cost = migration_cost[distance];
 	else {
 		cost = measure_migration_cost(cpu1, cpu2);



Also, the cost calculation in measure_one() looks fishy to me in this version.

> + /*
> +  * Dirty the working set:
> +  */
> + t0 = sched_clock();
> + touch_cache(cache, size);
> + t1 = sched_clock();
> +
> + /*
> +  * Migrate to the target CPU, dirty the L2 cache and access
> +  * the shared buffer. (which represents the working set
> +  * of a migrated task.)
> +  */
> + mask = cpumask_of_cpu(target);
> + set_cpus_allowed(current, mask);
> + WARN_ON(smp_processor_id() != target);
> +
> + t2 = sched_clock();
> + touch_cache(cache, size);
> + t3 = sched_clock();
> +
> + cost = t2-t1 + t3-t2;

Typo here ??




Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Ingo wrote:
> i've attached the latest snapshot.

I ran your latest snapshot on a 64 CPU (well, 62 - one node wasn't
working) system.  I made one change - chopping the matrix lines at 8 terms.
It's a hack - I don't know if it's a good idea.  But the long lines were
hard to read (and would only get worse on a 512).  And I had a fear,
probably unfounded, that the long lines could slow things down.

It built and ran fine, exactly as provided, against 2.6.12-rc1-mm4. I
probably have the unchopped matrix output in my screenlog file, if you
want it.  Though, given that the matrix is more or less symmetric, I
wasn't seeing much value in the part I chopped.

It took 24 seconds - a little painful, but booting this system takes
a few minutes, so 24 seconds is not fatal - just painful.

The maximum-finding code - to stop scanning after the max has been
passed - works fine.  If it had been (impossibly) perfect, stopping right
at the max, it would have been perhaps 30% faster, so there is not a
huge amount to be gained from trying to fine-tune the scan termination
logic.

I can imagine that one could trim this time by doing a couple of scans,
the first one at lower density (perhaps just one out of four sizes
considered), then the second scan at full density, around the maximum
found by the first.  However this would be less robust, and yet more
logic.
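
A rough sketch of what that could look like (hypothetical code, not part of the patch; measure_cost() stands in for whatever the patch's per-buffer-size measurement does):

/*
 * Coarse pass: sample only about every 4th buffer size (~20% steps) to
 * find the neighbourhood of the maximum; fine pass: rescan just that
 * neighbourhood at the full 5% density.  Sketch only - clamping, overflow
 * and the exact step rule are glossed over.
 */
static unsigned long long __init
two_pass_max_cost(int cpu1, int cpu2, unsigned int min_size,
		  unsigned int max_size)
{
	unsigned int size, best_size = min_size;
	unsigned long long cost, best = 0;

	for (size = min_size; size <= max_size; size += size/5 + 1) {
		cost = measure_cost(cpu1, cpu2, size);	/* hypothetical helper */
		if (cost > best) {
			best = cost;
			best_size = size;
		}
	}

	best = 0;
	for (size = max(min_size, best_size - best_size/4);
	     size <= best_size + best_size/4; size += size/20 + 1) {
		cost = measure_cost(cpu1, cpu2, size);
		if (cost > best)
			best = cost;
	}
	return best;
}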

Or perhaps, long shot, one could get fancy with some parameterized curve
fitting.  If some equation is a reasonably fit for the function being
sampled here, then just a low density scan through the max could be used
to estimate the co-efficients of whatever the equation was, and the
equation used to find the maximum, instead of the samples.  This would
be fun to play with, but I can't now - other duties are calling.

The one change:

diff -Naurp auto-tune_migration_costs/kernel/sched.c auto-tune_migration_costs_chopped/kernel/sched.c
--- auto-tune_migration_costs/kernel/sched.c	2005-04-04 09:11:43.0 -0700
+++ auto-tune_migration_costs_chopped/kernel/sched.c	2005-04-04 09:11:22.0 -0700
@@ -5287,6 +5287,7 @@ void __devinit calibrate_migration_costs
 			distance = domain_distance(cpu1, cpu2);
 			max_distance = max(max_distance, distance);
 			cost = migration_cost[distance];
+			if (cpu2 < 8)
 			printk(" %2ld.%ld(%ld)", (long)cost / 100,
 				((long)cost / 10) % 10, distance);
 		}

With this change, the output was:

Memory: 243350592k/244270096k available (7182k code, 921216k reserved, 3776k 
data, 368k init)
McKinley Errata 9 workaround not needed; disabling it
Dentry cache hash table entries: 33554432 (order: 14, 268435456 bytes)
Inode-cache hash table entries: 16777216 (order: 13, 134217728 bytes)
Mount-cache hash table entries: 1024
Boot processor id 0x0/0x40
Brought up 62 CPUs
Total of 62 processors activated (138340.68 BogoMIPS).
-> [0][2][3145728]  12.3 [ 12.3] (1): (12361880  6180940)
-> [0][2][3311292]  13.1 [ 13.1] (1): (13175591  3497325)
-> [0][2][3485570]  13.7 [ 13.7] (1): (13718647  2020190)
-> [0][2][3669021]  14.3 [ 14.3] (1): (14356800  1329171)
-> [0][2][3862127]  15.5 [ 15.5] (1): (15522156  1247263)
-> [0][2][4065396]  16.4 [ 16.4] (1): (16487934  1106520)
-> [0][2][4279364]  17.3 [ 17.3] (1): (17356154   987370)
-> [0][2][4504593]  18.1 [ 18.1] (1): (18144452   887834)
-> [0][2][4741676]  18.9 [ 18.9] (1): (18934638   839010)
-> [0][2][4991237]  19.9 [ 19.9] (1): (19965884   935128)
-> [0][2][5253933]  21.0 [ 21.0] (1): (21067441  1018342)
-> [0][2][5530455]  22.3 [ 22.3] (1): (22303727  1127314)
-> [0][2][5821531]  23.4 [ 23.4] (1): (23453867  1138727)
-> [0][2][6127927]  23.4 [ 23.4] (1): (23406625   592984)
-> [0][2][6450449]  23.5 [ 23.5] (1): (23586123   386241)
-> [0][2][6789946]  23.5 [ 23.5] (1): (23519823   226270)
-> [0][2][7147311]  22.6 [ 23.5] (1): (22619385   563354)
-> [0][2][7523485]  21.9 [ 23.5] (1): (21998024   592357)
-> [0][2][7919457]  20.7 [ 23.5] (1): (20705771   942305)
-> [0][2][8336270]  17.2 [ 23.5] (1): (17244361  2201857)
-> [0][2][8775021]  14.6 [ 23.5] (1): (14644331  2400943)
-> found max.
[0][2] working set size found: 6450449, cost: 23586123
-> [0][32][3145728]  17.8 [ 17.8] (2): (17848927  8924463)
-> [0][32][3311292]  18.8 [ 18.8] (2): (18811236  4943386)
-> [0][32][3485570]  19.7 [ 19.7] (2): (19779337  2955743)
-> [0][32][3669021]  20.8 [ 20.8] (2): (20811634  1994020)
-> [0][32][3862127]  21.9 [ 21.9] (2): (21919806  1551096)
-> [0][32][4065396]  23.0 [ 23.0] (2): (23075814  1353552)
-> [0][32][4279364]  24.2 [ 24.2] (2): (24267691  1272714)
-> [0][32][4504593]  25.5 [ 25.5] (2): (25546809  1275916)
-> [0][32][4741676]  26.8 [ 26.8] (2): (26886375  1307741)
-> [0][32][4991237]  28.2 [ 28.2] (2): (28291601  1356483)
-> [0][32][5253933]  29.5 [ 29.5] (2): (29587239  1326060)
-> [0][32][5530455]  30.6 [ 30.6] (2): (30669228  1204024)
-> 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Chen, Kenneth W <[EMAIL PROTECTED]> wrote:

> Ingo Molnar wrote on Saturday, April 02, 2005 11:04 PM
> > the default on ia64 (32MB) was way too large and caused the search to
> > start from 64MB. That can take a _long_ time.
> >
> > i've attached a new patch with your changes included, and a couple of
> > new things added:
> >
> >  - removed the 32MB max_cache_size hack from ia64 - it should now fall
> >    back to the default 5MB and do a search from 10MB downwards. This
> >    should speed up the search.
> 
> The cache size information on ia64 is already available at the finger 
> tip. Here is a patch that I whipped up to set max_cache_size for ia64.

thanks - i've added this to my tree.

i've attached the latest snapshot. There are a number of changes in the 
patch: firstly, i changed the direction of the iteration to go from 
small sizes to larger sizes, and i added a method to detect the maximum.  

Furthermore, i tweaked the test some more to make it both faster and 
more reliable, and i cleaned up the code. (e.g. now we migrate via the 
scheduler, not via on_each_cpu().) The default patch should print enough 
debug information as-is.

I changed the workload too so potentially the detected values might be 
off from the ideal value on your box. The values detected on x86 are 
mostly unchanged, relative to previous patches.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4640,6 +4641,478 @@ void __devinit init_sched_build_groups(s
 }
 
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer 
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE   2
+#define MIN_CACHE_SIZE (64*1024U)
+#define DEFAULT_CACHE_SIZE (5*1024*1024U)
+#define ITERATIONS 3
+#define SIZE_THRESH	130
+#define COST_THRESH	130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+   { -1LL , };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+   int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+   str = get_options(str, ARRAY_SIZE(ints), ints);
+
+   printk("#ints: %d\n", ints[0]);
+   for (i = 1; i <= ints[0]; i++) {
+   migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+   printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+   }
+   return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+   get_option(&str, &migration_factor);
+   migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+   return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+__init static unsigned long domain_distance(int cpu1, int cpu2)
+{
+ 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Ingo wrote:
> the problem i mentioned earlier is that there is no other use

Eh ... whatever.  The present seems straightforward enough, with a
simple sched domain tree and your auto-tune migration cost calculation
bolted directly on top of that.

I'd better leave the futures to those more experienced than I.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Ingo wrote:
> agreed - i've changed it to domain_distance() in my tree.

Good - cool - thanks.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Would be a good idea to rename 'cpu_distance()' to something more 
> specific, like 'cpu_dist_ndx()', and reserve the generic name 
> 'cpu_distance()' for later use to return a scaled integer distance, 
> rather like 'node_distance()' does now. [...]

agreed - i've changed it to domain_distance() in my tree.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Nick wrote:
> > In a sense, the information *is* already there - in node_distance.
> > What I think should be done is probably to use node_distance when
> > calculating costs, ...
> 
> Hmmm ... perhaps I'm confused, but this sure sounds like the alternative
> implementation of cpu_distance using node_distance that I submitted to
> this thread about 16 hours ago.

yes, it's that method.

> [...] It was using this alternative that got me the more varied 
> matrix:
> 
> ------------------------------------------------------------------------
>           [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
> [00]:      -     4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
> [01]:    4.0(0)    -    21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
> [02]:   21.7(1) 21.7(1)    -     4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
> [03]:   21.7(1) 21.7(1)  4.0(0)    -    25.3(3) 25.3(3) 25.2(2) 25.2(2)
> [04]:   25.2(2) 25.2(2) 25.3(3) 25.3(3)    -     4.0(0) 21.7(1) 21.7(1)
> [05]:   25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)    -    21.7(1) 21.7(1)
> [06]:   25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)    -     4.0(0)
> [07]:   25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)    -
> ------------------------------------------------------------------------

the problem i mentioned earlier is that there is no other use for the 
matrix right now than the domain hierarchy. And if there's no place in 
the domain hierarchy to put this info then the information is lost.

so we might be able to _measure_ a rank-3 matrix, but if the domain is 
only rank-2 then we'll have to discard one level of information.

we could try some hybrid method of averaging 25.3 with 21.7 and putting 
that into the domain tree, but i'd be against it for the following 
reasons:

firstly, _if_ an extra level in the hierarchy makes a difference, we 
might as well add it to the domain tree - and that may bring other 
advantages (in terms of more finegrained balancing) in addition to 
better migration.

secondly, right now the cost measurement method and calculation is 
rather simple and has minimal assumptions, and i'd like to keep it so as 
long as possible. If an extra domain level gives problems or artifacts 
elsewhere then we should fix those problems if possible, and not 
complicate the cost calculation.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Nick wrote:
> In a sense, the information *is* already there - in node_distance.
> What I think should be done is probably to use node_distance when
> calculating costs, ...

Hmmm ... perhaps I'm confused, but this sure sounds like the alternative
implementation of cpu_distance using node_distance that I submitted to
this thread about 16 hours ago.  It was using this alternative that
got me the more varied matrix:

------------------------------------------------------------------------
          [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
[00]:      -     4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[01]:    4.0(0)    -    21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[02]:   21.7(1) 21.7(1)    -     4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
[03]:   21.7(1) 21.7(1)  4.0(0)    -    25.3(3) 25.3(3) 25.2(2) 25.2(2)
[04]:   25.2(2) 25.2(2) 25.3(3) 25.3(3)    -     4.0(0) 21.7(1) 21.7(1)
[05]:   25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)    -    21.7(1) 21.7(1)
[06]:   25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)    -     4.0(0)
[07]:   25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)    -
------------------------------------------------------------------------
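
For what it's worth, the general shape of that alternative as a hedged sketch (not the code actually submitted earlier in the thread): derive the matrix slot from node_distance() instead of walking the domain tree, mapping each distinct SLIT value to a small index:

/*
 * Sketch only.  node_distance() and cpu_to_node() are the existing NUMA
 * helpers; distance_to_index() is a hypothetical mapping from the raw
 * SLIT value to a small slot number usable as a migration_cost[] index.
 */
static unsigned long cpu_distance(int cpu1, int cpu2)
{
	int d = node_distance(cpu_to_node(cpu1), cpu_to_node(cpu2));

	return distance_to_index(d);	/* e.g. rank of d among the
					   distinct SLIT values seen */
}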

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> > a numa scheduler domain at the top level and cache_hot_time will be 
> > set to 0 in that case on smp box.  Though this will be a moot point 
> > with recent patch from Suresh Siddha for removing the extra bogus 
> > scheduler domains.  
> > http://marc.theaimsgroup.com/?t=11124020801=1=2
> 
> at first sight the dummy domain should not be a problem, [...]

at second sight, maybe it could be a problem after all. It's safe in 
load_balance(), where task_hot() should never happen to be called for 
the dummy domain. (because the dummy domain has only one CPU group on 
such boxes)

But if the dummy domain has SD_WAKE_AFFINE set then it's being 
considered for passive migration, and a value of 0 means 'can always 
migrate', and in situations where other domains signalled 'task is too 
hot', this domain may still override the decision (incorrectly). So the 
safe value for dummy domains would be a cacheflush time of 'infinity' - to 
make sure migration decisions are only done via other domains.
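
In other words (an illustrative sketch only, not the scheduler's actual task_hot() code): the cache-hot test is essentially a comparison against the domain's cache_hot_time, so a value of 0 never vetoes a migration, while an 'infinite' value always does and leaves the decision to the real domains:

/*
 * Simplified sketch of the passive-migration check being described;
 * names and types are illustrative, not the real scheduler code.
 */
static inline int task_is_cache_hot(unsigned long long now,
				    unsigned long long last_ran,
				    unsigned long long cache_hot_time)
{
	/*
	 * cache_hot_time == 0     -> never hot, migration always allowed
	 * cache_hot_time == ~0ULL -> always hot, this domain never permits it
	 */
	return (now - last_ran) < cache_hot_time;
}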

I've changed this in my tree - migration_cost[] is now initialized to 
-1LL, which should solve this problem.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Chen, Kenneth W <[EMAIL PROTECTED]> wrote:

> Ingo Molnar wrote on Sunday, April 03, 2005 7:30 AM
> > how close are these numbers to the real worst-case migration costs on
> > that box?
> 
> I booted your latest patch on a 4-way SMP box (1.5 GHz, 9MB ia64). This
> is what it produces.  I think the estimate is excellent.
> 
> [00]:     -    10.4(0) 10.4(0) 10.4(0)
> [01]:  10.4(0)    -    10.4(0) 10.4(0)
> [02]:  10.4(0) 10.4(0)    -    10.4(0)
> [03]:  10.4(0) 10.4(0) 10.4(0)    -
> --------------------------------------
> cacheflush times [1]: 10.4 (10448800)

great! How long does the benchmark take (hours?), and is there any way 
to speed up the benchmarking (without hurting accuracy), so that 
multiple migration-cost settings could be tried? Would it be possible to 
try a few other values via the migration_factor boot option, in 0.5 msec 
steps or so, to find the current sweet spot? It used to be at 11 msec 
previously, correct? E.g. migration_factor=105 will change the cost to 
10.9 msec, migration_factor=110 will change it to 11.4, etc. Or with the 
latest snapshot you can set absolute values as well, 
migration_cost=11500 sets the cost to 11.5 msec.
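
As a quick check of the arithmetic behind those examples, a sketch (not the patch's internal code, which rescales the percentage to the power-of-two MIGRATION_FACTOR_SCALE base first):

/* scale a measured migration cost (in nanoseconds) by a percentage factor */
static unsigned long long scale_cost(unsigned long long measured_ns,
				     unsigned int factor_percent)
{
	return measured_ns * factor_percent / 100;
}

/*
 * With the measured 10448800 ns (10.4 msec) above:
 *   scale_cost(10448800, 105) = 10971240 ns  (~10.9 msec)
 *   scale_cost(10448800, 110) = 11493680 ns  (~11.4 msec)
 */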

> One other minor thing: when booting a numa kernel on smp box, there is 
> a numa scheduler domain at the top level and cache_hot_time will be 
> set to 0 in that case on smp box.  Though this will be a moot point 
> with recent patch from Suresh Siddha for removing the extra bogus 
> scheduler domains.  
> http://marc.theaimsgroup.com/?t=11124020801=1=2

at first sight the dummy domain should not be a problem, the ->cache_hot 
values are only used when deciding whether a task should migrate to a 
parallel domain or not - if there's an extra highlevel domain instance 
then such decisions are never made, so a zero value makes no difference.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Chen, Kenneth W [EMAIL PROTECTED] wrote:

 Ingo Molnar wrote on Sunday, April 03, 2005 7:30 AM
  how close are these numbers to the real worst-case migration costs on
  that box?
 
 I booted your latest patch on a 4-way SMP box (1.5 GHz, 9MB ia64). This
 is what it produces.  I think the estimate is excellent.
 
 [00]: -10.4(0) 10.4(0) 10.4(0)
 [01]:  10.4(0)-10.4(0) 10.4(0)
 [02]:  10.4(0) 10.4(0)-10.4(0)
 [03]:  10.4(0) 10.4(0) 10.4(0)-
 -
 cacheflush times [1]: 10.4 (10448800)

great! How long does the benchmark take (hours?), and is there any way 
to speed up the benchmarking (without hurting accuracy), so that 
multiple migration-cost settings could be tried? Would it be possible to 
try a few other values via the migration_factor boot option, in 0.5 msec 
steps or so, to find the current sweet spot? It used to be at 11 msec 
previously, correct? E.g. migration_factor=105 will change the cost to 
10.9 msec, migration_factor=110 will change it to 11.4, etc. Or with the 
latest snapshot you can set absolute values as well, 
migration_cost=11500 sets the cost to 11.5 msec.

 One other minor thing: when booting a numa kernel on smp box, there is 
 a numa scheduler domain at the top level and cache_hot_time will be 
 set to 0 in that case on smp box.  Though this will be a mutt point 
 with recent patch from Suresh Siddha for removing the extra bogus 
 scheduler domains.  
 http://marc.theaimsgroup.com/?t=11124020801r=1w=2

at first sight the dummy domain should not be a problem, the -cache_hot 
values are only used when deciding whether a task should migrate to a 
parallel domain or not - if there's an extra highlevel domain instance 
then such decisions are never made, so a zero value makes no difference.

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Nick wrote:
 In a sense, the information *is* already there - in node_distance.
 What I think should be done is probably to use node_distance when
 calculating costs, ...

Hmmm ... perhaps I'm confused, but this sure sounds like the alternative
implementation of cpu_distance using node_distance that I submitted to
this thread about 16 hours ago.  It was using this alternative that
got me the more varied matrix:

-
  [00][01][02][03][04][05][06][07]
[00]: - 4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[01]:   4.0(0)-21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[02]:  21.7(1) 21.7(1)- 4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
[03]:  21.7(1) 21.7(1)  4.0(0)-25.3(3) 25.3(3) 25.2(2) 25.2(2)
[04]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)- 4.0(0) 21.7(1) 21.7(1)
[05]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)-21.7(1) 21.7(1)
[06]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)- 4.0(0)
[07]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)-
-

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Ingo Molnar [EMAIL PROTECTED] wrote:

  a numa scheduler domain at the top level and cache_hot_time will be 
  set to 0 in that case on smp box.  Though this will be a mutt point 
  with recent patch from Suresh Siddha for removing the extra bogus 
  scheduler domains.  
  http://marc.theaimsgroup.com/?t=11124020801r=1w=2
 
 at first sight the dummy domain should not be a problem, [...]

at second sight, maybe it could be a problem after all. It's safe is 
load_balance(), where task_hot() should never happen to be called for 
the dummy domain. (because the dummy domain has only one CPU group on 
such boxes)

But if the dummy domain has SD_WAKE_AFFINE set then it's being 
considered for passive migration, and a value of 0 means 'can always 
migrate', and in situations where other domains signalled 'task is too 
hot', this domain may still override the decision (incorrectly). So the 
safe value for dummy domains would a cacheflush time of 'infinity' - to 
make sure migration decisions are only done via other domains.

I've changed this in my tree - migration_cost[] is now initialized to 
-1LL, which should solve this problem.

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 Nick wrote:
  In a sense, the information *is* already there - in node_distance.
  What I think should be done is probably to use node_distance when
  calculating costs, ...
 
 Hmmm ... perhaps I'm confused, but this sure sounds like the alternative
 implementation of cpu_distance using node_distance that I submitted to
 this thread about 16 hours ago.

yes, it's that method.

 [...] It was using this alternative that got me the more varied 
 matrix:
 
 -
   [00][01][02][03][04][05][06][07]
 [00]: - 4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
 [01]:   4.0(0)-21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
 [02]:  21.7(1) 21.7(1)- 4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
 [03]:  21.7(1) 21.7(1)  4.0(0)-25.3(3) 25.3(3) 25.2(2) 25.2(2)
 [04]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)- 4.0(0) 21.7(1) 21.7(1)
 [05]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)-21.7(1) 21.7(1)
 [06]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)- 4.0(0)
 [07]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)-
 -

the problem i mentioned earlier is that there is no other use for the 
matrix right now than the domain hierarchy. And if there's no place in 
the domain hieararchy to put this info then the information is lost.

so we might be able to _measure_ a rank-3 matrix, but if the domain is 
only rank-2 then we'll have to discard one level of information.

we could try some hybride method of averaging 25.3 with 21.7 and putting 
that into the domain tree, but i'd be against it for the following 
reasons:

firstly, _if_ an extra level in the hierarchy makes a difference, we 
might as well add it to the domain tree - and that may bring other 
advantages (in terms of more finegrained balancing) in addition to 
better migration.

secondly, right now the cost measurement method and calculation is 
rather simple and has minimal assumptions, and i'd like to keep it so as 
long as possible. If an extra domain level gives problems or artifacts 
elsewhere then we should fix those problems if possible, and not 
complicate the cost calculation.

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 Would be a good idea to rename 'cpu_distance()' to something more 
 specific, like 'cpu_dist_ndx()', and reserve the generic name 
 'cpu_distance()' for later use to return a scaled integer distance, 
 rather like 'node_distance()' does now. [...]

agreed - i've changed it to domain_distance() in my tree.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Ingo wrote:
 the problem i mentioned earlier is that there is no other use

Eh ... whatever.  The present seems straightforward enough, with a
simple sched domain tree and your auto-tune migration cost calculation
bolted directly on top of that.

I'd better leave the futures to those more experienced than I.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Chen, Kenneth W [EMAIL PROTECTED] wrote:

 Ingo Molnar wrote on Saturday, April 02, 2005 11:04 PM
  the default on ia64 (32MB) was way too large and caused the search to
  start from 64MB. That can take a _long_ time.
 
  i've attached a new patch with your changes included, and a couple of
  new things added:
 
   - removed the 32MB max_cache_size hack from ia64 - it should now fall
 back to the default 5MB and do a search from 10MB downwards. This
 should speed up the search.
 
 The cache size information on ia64 is already available at the finger 
 tip. Here is a patch that I whipped up to set max_cache_size for ia64.

thanks - i've added this to my tree.

i've attached the latest snapshot. There are a number of changes in the 
patch: firstly, i changed the direction of the iteration to go from 
small sizes to larger sizes, and i added a method to detect the maximum.  

Furthermore, i tweaked the test some more to make it both faster and 
more reliable, and i cleaned up the code. (e.g. now we migrate via the 
scheduler, not via on_each_cpu().) The default patch should print enough 
debug information as-is.

I changed the workload too so potentially the detected values might be 
off from the ideal value on your box. The values detected on x86 are 
mostly unchanged, relative to previous patches.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4640,6 +4641,478 @@ void __devinit init_sched_build_groups(s
 }
 
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer 
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE   2
+#define MIN_CACHE_SIZE (64*1024U)
+#define DEFAULT_CACHE_SIZE (5*1024*1024U)
+#define ITERATIONS 3
+#define SIZE_THRESH	130
+#define COST_THRESH	130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+   { -1LL , };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+   int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+   str = get_options(str, ARRAY_SIZE(ints), ints);
+
+   printk("#ints: %d\n", ints[0]);
+   for (i = 1; i <= ints[0]; i++) {
+   migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+   printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+   }
+   return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+   get_option(&str, &migration_factor);
+   migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+   return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+__init static 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Paul Jackson
Ingo wrote:
 i've attached the latest snapshot.

I ran your latest snapshot on a 64 CPU (well, 62 - one node wasn't
working) system.  I made one change - chop the matrix lines at 8 terms. 
It's a hack - don't know if it's a good idea.  But the long lines were
hard to read (and would only get worse on a 512).  And I had a fear,
probably unfounded, that the long lines could slow things down.

It built and ran fine, exactly as provided, against 2.6.12-rc1-mm4. I
probably have the unchopped matrix output in my screenlog file, if you
want it.  Though, given that the matrix is more or less symmetric, I
wasn't seeing much value in the part I chopped.

It took 24 seconds - a little painful, but booting this system takes
a few minutes, so 24 seconds is not fatal - just painful.

The maximum finding code - to stop scanning after the max has been
passed, works fine.  If it had been (impossibly) perfect, stopping right
at the max, it would have been perhaps 30% faster, so there is not a
huge amount to be gained from trying to fine tune the scan termination
logic.

I can imagine that one could trim this time by doing a couple of scans,
the first one at lower density (perhaps just one out of four sizes
considered), then the second scan at full density, around the maximum
found by the first.  However this would be less robust, and yet more
logic.
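
[Illustration of the two-pass idea - not code from the patch. A coarse
pass samples only every 4th buffer size, then a full-density pass
re-measures a window around the coarse maximum. measure_cost() and the
size table are hypothetical stand-ins for the patch's per-size
measurement:]

	static unsigned long long two_pass_max(
			unsigned long long (*measure_cost)(unsigned long),
			const unsigned long *sizes, int nr_sizes)
	{
		unsigned long long best = 0, cost;
		int i, coarse_best = 0, lo, hi;

		/* pass 1: coarse - only every 4th buffer size */
		for (i = 0; i < nr_sizes; i += 4) {
			cost = measure_cost(sizes[i]);
			if (cost > best) {
				best = cost;
				coarse_best = i;
			}
		}

		/* pass 2: full density in a window around the coarse maximum */
		lo = coarse_best >= 4 ? coarse_best - 4 : 0;
		hi = coarse_best + 4 < nr_sizes ? coarse_best + 4 : nr_sizes - 1;
		for (i = lo; i <= hi; i++) {
			cost = measure_cost(sizes[i]);
			if (cost > best)
				best = cost;
		}
		return best;
	}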

Or perhaps, long shot, one could get fancy with some parameterized curve
fitting.  If some equation is a reasonable fit for the function being
sampled here, then just a low density scan through the max could be used
to estimate the coefficients of whatever the equation was, and the
equation used to find the maximum, instead of the samples.  This would
be fun to play with, but I can't now - other duties are calling.
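
[Illustration of the curve-fitting idea - not from the patch: fit a
parabola through three equally spaced samples around the hump and take
its vertex as the estimated location of the maximum. Floating point is
used for clarity; kernel code would need a fixed-point variant:]

	/*
	 * Fit y = a*x^2 + b*x + c through (x1-h, y0), (x1, y1), (x1+h, y2)
	 * and return the x coordinate of the vertex. Falls back to x1 when
	 * the samples are not concave (no interior maximum).
	 */
	static double parabola_peak(double x1, double h,
				    double y0, double y1, double y2)
	{
		double curvature = y0 - 2.0 * y1 + y2;	/* equals 2*a*h^2 */

		if (curvature >= 0.0)
			return x1;
		return x1 + h * (y0 - y2) / (2.0 * curvature);
	}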

The one change:

diff -Naurp auto-tune_migration_costs/kernel/sched.c auto-tune_migration_costs_chopped/kernel/sched.c
--- auto-tune_migration_costs/kernel/sched.c    2005-04-04 09:11:43.0 -0700
+++ auto-tune_migration_costs_chopped/kernel/sched.c    2005-04-04 09:11:22.0 -0700
@@ -5287,6 +5287,7 @@ void __devinit calibrate_migration_costs
distance = domain_distance(cpu1, cpu2);
max_distance = max(max_distance, distance);
cost = migration_cost[distance];
+   if (cpu2 < 8)
printk(" %2ld.%ld(%ld)", (long)cost / 1000000,
((long)cost / 100000) % 10, distance);
}

With this change, the output was:

Memory: 243350592k/244270096k available (7182k code, 921216k reserved, 3776k 
data, 368k init)
McKinley Errata 9 workaround not needed; disabling it
Dentry cache hash table entries: 33554432 (order: 14, 268435456 bytes)
Inode-cache hash table entries: 16777216 (order: 13, 134217728 bytes)
Mount-cache hash table entries: 1024
Boot processor id 0x0/0x40
Brought up 62 CPUs
Total of 62 processors activated (138340.68 BogoMIPS).
- [0][2][3145728]  12.3 [ 12.3] (1): (12361880  6180940)
- [0][2][3311292]  13.1 [ 13.1] (1): (13175591  3497325)
- [0][2][3485570]  13.7 [ 13.7] (1): (13718647  2020190)
- [0][2][3669021]  14.3 [ 14.3] (1): (14356800  1329171)
- [0][2][3862127]  15.5 [ 15.5] (1): (15522156  1247263)
- [0][2][4065396]  16.4 [ 16.4] (1): (16487934  1106520)
- [0][2][4279364]  17.3 [ 17.3] (1): (17356154   987370)
- [0][2][4504593]  18.1 [ 18.1] (1): (18144452   887834)
- [0][2][4741676]  18.9 [ 18.9] (1): (18934638   839010)
- [0][2][4991237]  19.9 [ 19.9] (1): (19965884   935128)
- [0][2][5253933]  21.0 [ 21.0] (1): (21067441  1018342)
- [0][2][5530455]  22.3 [ 22.3] (1): (22303727  1127314)
- [0][2][5821531]  23.4 [ 23.4] (1): (23453867  1138727)
- [0][2][6127927]  23.4 [ 23.4] (1): (23406625   592984)
- [0][2][6450449]  23.5 [ 23.5] (1): (23586123   386241)
- [0][2][6789946]  23.5 [ 23.5] (1): (23519823   226270)
- [0][2][7147311]  22.6 [ 23.5] (1): (22619385   563354)
- [0][2][7523485]  21.9 [ 23.5] (1): (21998024   592357)
- [0][2][7919457]  20.7 [ 23.5] (1): (20705771   942305)
- [0][2][8336270]  17.2 [ 23.5] (1): (17244361  2201857)
- [0][2][8775021]  14.6 [ 23.5] (1): (14644331  2400943)
- found max.
[0][2] working set size found: 6450449, cost: 23586123
- [0][32][3145728]  17.8 [ 17.8] (2): (17848927  8924463)
- [0][32][3311292]  18.8 [ 18.8] (2): (18811236  4943386)
- [0][32][3485570]  19.7 [ 19.7] (2): (19779337  2955743)
- [0][32][3669021]  20.8 [ 20.8] (2): (20811634  1994020)
- [0][32][3862127]  21.9 [ 21.9] (2): (21919806  1551096)
- [0][32][4065396]  23.0 [ 23.0] (2): (23075814  1353552)
- [0][32][4279364]  24.2 [ 24.2] (2): (24267691  1272714)
- [0][32][4504593]  25.5 [ 25.5] (2): (25546809  1275916)
- [0][32][4741676]  26.8 [ 26.8] (2): (26886375  1307741)
- [0][32][4991237]  28.2 [ 28.2] (2): (28291601  1356483)
- [0][32][5253933]  29.5 [ 29.5] (2): (29587239  1326060)
- [0][32][5530455]  30.6 [ 30.6] (2): (30669228  1204024)
- [0][32][5821531]  30.9 [ 30.9] (2): (30969069   

RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Chen, Kenneth W
* Chen, Kenneth W [EMAIL PROTECTED] wrote:
 The cache size information on ia64 is already available at the finger
 tip. Here is a patch that I whipped up to set max_cache_size for ia64.

Ingo Molnar wrote on Monday, April 04, 2005 4:38 AM
 thanks - i've added this to my tree.

 i've attached the latest snapshot. There are a number of changes in the
 patch: firstly, i changed the direction of the iteration to go from
 small sizes to larger sizes, and i added a method to detect the maximum.

 Furthermore, i tweaked the test some more to make it both faster and
 more reliable, and i cleaned up the code. (e.g. now we migrate via the
 scheduler, not via on_each_cpu().) The default patch should print enough
 debug information as-is.

 I changed the workload too so potentially the detected values might be
 off from the ideal value on your box. The values detected on x86 are
 mostly unchanged, relative to previous patches.

Perhaps, I'm not getting the latest patch?  It skipped measuring because
migration cost array is non-zero (initialized to -1LL):

          [00]    [01]    [02]    [03]
[00]:       -    0.0(0)  0.0(0)  0.0(0)
[01]:     0.0(0)    -    0.0(0)  0.0(0)
[02]:     0.0(0)  0.0(0)    -    0.0(0)
[03]:     0.0(0)  0.0(0)  0.0(0)    -

| cacheflush times [1]: 0.0 (-1)
| calibration delay: 0 seconds



Need this change?  I bet you had that in your tree already.

--- ./kernel/sched.c.orig   2005-04-04 18:01:45.0 -0700
+++ ./kernel/sched.c    2005-04-04 18:21:41.0 -0700
@@ -5050,7 +5050,7 @@ void __devinit calibrate_migration_costs
/*
 * Do we have the result cached already?
 */
-   if (migration_cost[distance])
+   if (migration_cost[distance] != -1LL)
cost = migration_cost[distance];
else {
cost = measure_migration_cost(cpu1, cpu2);



Also, the cost calculation in measure_one() looks fishy to me in this version.

 + /*
 +  * Dirty the working set:
 +  */
 + t0 = sched_clock();
 + touch_cache(cache, size);
 + t1 = sched_clock();
 +
 + /*
 +  * Migrate to the target CPU, dirty the L2 cache and access
 +  * the shared buffer. (which represents the working set
 +  * of a migrated task.)
 +  */
 + mask = cpumask_of_cpu(target);
 + set_cpus_allowed(current, mask);
 + WARN_ON(smp_processor_id() != target);
 +
 + t2 = sched_clock();
 + touch_cache(cache, size);
 + t3 = sched_clock();
 +
 + cost = t2-t1 + t3-t2;

Typo here ??
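
[Editorial note: given the comments above - the timed passes are the
source CPU's dirtying pass (t0..t1) and the target CPU's access pass
(t2..t3) - the intended expression is presumably:]

	cost = t1-t0 + t3-t2;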




Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

* Chen, Kenneth W [EMAIL PROTECTED] wrote:

 Perhaps, I'm not getting the latest patch?  It skipped measuring 
 because migration cost array is non-zero (initialized to -1LL):

yeah ... some mixup here. I've attached the latest.

 Also, the cost calculation in measure_one() looks fishy to me in this 
 version.

  +   t0 = sched_clock();
  +   touch_cache(cache, size);
  +   t1 = sched_clock();

  +   t2 = sched_clock();
  +   touch_cache(cache, size);
  +   t3 = sched_clock();

  +   cost = t2-t1 + t3-t2;
 
 Typo here ??

yeah - fixed this too in the attached snapshot.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4640,6 +4641,478 @@ void __devinit init_sched_build_groups(s
 }
 
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer 
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE   2
+#define MIN_CACHE_SIZE (64*1024U)
+#define DEFAULT_CACHE_SIZE (5*1024*1024U)
+#define ITERATIONS 3
+#define SIZE_THRESH	130
+#define COST_THRESH	130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+   { -1LL , };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+   int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+   str = get_options(str, ARRAY_SIZE(ints), ints);
+
+   printk("#ints: %d\n", ints[0]);
+   for (i = 1; i <= ints[0]; i++) {
+   migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+   printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+   }
+   return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+   get_option(&str, &migration_factor);
+   migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+   return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+__init static unsigned long domain_distance(int cpu1, int cpu2)
+{
+   unsigned long distance = 0;
+   struct sched_domain *sd;
+
+   for_each_domain(cpu1, sd) {
+   WARN_ON(!cpu_isset(cpu1, sd->span));
+   if (cpu_isset(cpu2, sd->span))
+   return distance;
+   distance++;
+   }
+   if (distance >= MAX_DOMAIN_DISTANCE) {
+   WARN_ON(1);
+   distance = MAX_DOMAIN_DISTANCE-1;
+   }
+
+   return distance;
+}
+
+static __initdata unsigned int migration_debug = 1;
+
+static int __init setup_migration_debug(char *str)
+{
+   get_option(&str, &migration_debug);
+   return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-04 Thread Ingo Molnar

latest patch attached. Changes:

 - stabilized calibration even more, by using cache flushing 
   instructions to generate a predictable working set. The cache 
   flushing itself is not timed, it is used to create quiescent
   cache  state.

   I only guessed the ia64 version - e.g. i didn't know what 'type' 
   argument to pass to ia64_sal_cache_flush() to get a d/cache 
   flush+invalidate. Same for ppc/ppc64 - i only guessed the function
   in question but didn't test it.

 - due to more stable results, reduced ITERATIONS from 3 to 2 - this 
   should further speed up calibration.

tested on x86, the calibration results look ok there.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4640,6 +4641,506 @@ void __devinit init_sched_build_groups(s
 }
 
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer 
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE   2
+#define MIN_CACHE_SIZE (64*1024U)
+#define DEFAULT_CACHE_SIZE (5*1024*1024U)
+#define ITERATIONS 2
+#define SIZE_THRESH	130
+#define COST_THRESH	130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+   { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+   int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+   str = get_options(str, ARRAY_SIZE(ints), ints);
+
+   printk("#ints: %d\n", ints[0]);
+   for (i = 1; i <= ints[0]; i++) {
+   migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+   printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+   }
+   return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+   get_option(&str, &migration_factor);
+   migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+   return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+__init static unsigned long domain_distance(int cpu1, int cpu2)
+{
+   unsigned long distance = 0;
+   struct sched_domain *sd;
+
+   for_each_domain(cpu1, sd) {
+   WARN_ON(!cpu_isset(cpu1, sd->span));
+   if (cpu_isset(cpu2, sd->span))
+   return distance;
+   distance++;
+   }
+   if (distance >= MAX_DOMAIN_DISTANCE) {
+   WARN_ON(1);
+   distance = MAX_DOMAIN_DISTANCE-1;
+   }
+
+   return distance;
+}
+
+static __initdata unsigned int migration_debug = 1;
+
+static int __init setup_migration_debug(char *str)
+{
+   get_option(str, 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Nick Piggin
On Sun, 2005-04-03 at 20:55 -0700, Paul Jackson wrote:

> But if we knew the CPU hierarchy in more detail, and if we had some
> other use for that detail (we don't that I know), then I take it from
> your comment that we should be reluctant to push those details into the
> sched domains.  Put them someplace else if we need them.
> 

In a sense, the information *is* already there - in node_distance.
What I think should be done is probably to use node_distance when
calculating costs, and correlate that with sched-domains as best
we can.

I've got an idea of how to do it, but I'll wait until Ingo gets the
fundamentals working well before I have a look.

> 
> One question - how serious do you view difference in migration cost
> between say 21.7 and 25.3, two of the cacheflush times I reported on a
> small SN2?
> 
> I'm guessing that this is probably below the noise threshold, at least
> as far as scheduler domains, schedulers and migration care, unless and
> until some persuasive measurements show a situation in which it matters.
> 

Yes, likely below noise. There is an issue with a behavioural
transition point in the wakeup code where you might see good
behaviour with 21 and bad with 25, or vice versa on some workloads.
This is fixed in the scheduler patches coming through -mm though.

But I wasn't worried so much about the absolute value not being
right, but rather about it maybe not being deterministic. So maybe depending
on what CPU gets assigned what cpuid, you might get different
values on identical machines.

> As you say - not an exact science.
> 






Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
> There's no other place to push them 

One could make a place, if the need arose.

> but trying and benchmarking it is necessary to tell for sure.

Hard to argue with that ... ;).

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Ingo, if I understood correctly, suggested pushing any necessary 
> detail of the CPU hierarchy into the scheduler domains, so that his 
> latest work tuning migration costs could pick it up from there.
> 
> It makes good sense for the migration cost estimation to be based on 
> whatever CPU hierarchy is visible in the sched domains.
> 
> But if we knew the CPU hierarchy in more detail, and if we had some 
> other use for that detail (we don't that I know), then I take it from 
> your comment that we should be reluctant to push those details into 
> the sched domains.  Put them someplace else if we need them.

There's no other place to push them - most of the hierarchy related 
decisions are done based on the domain tree. So the decision to make is: 
"is it worth complicating the domain tree, in exchange for more accurate 
handling of the real hierarchy?".

In general, the pros are potentially more accuracy and thus higher 
application performance, the cons are overhead (more tree walking) and 
artifacts (the sched-domains logic is good but not perfect, and even if 
there were no bugs in it, the decisions are approximations. One more 
domain level might make things worse.)

but trying and benchmarking it is necessary to tell for sure.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Andy wrote:
> Not that I really know what I'm talking about here, but this sounds 
> highly parallelizable.

I doubt it.  If we are testing the cost of a migration between CPUs
alpha and beta, and at the same time testing between CPUs gamma and
delta, then often there will be some hardware that is shared by both the
alpha-beta path and the gamma-delta path.  This would affect the
test results.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Andy Lutomirski
Paul Jackson wrote:
> Ok - that flies, or at least walks.  It took 53 seconds to
> compute this cost matrix.

Not that I really know what I'm talking about here, but this sounds 
highly parallelizable.  It seems like you could do N/2 measurements at a 
time, so this should be O(N) to compute the matrix (ignoring issues of 
how long it takes to write the data to memory, but that should be 
insignificant).

Even if you can't parallelize it all the way, it ought to at least help.
--Andy


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Paul wrote:
> I should push in the direction of improving the
> SN2 sched domain hierarchy.

Nick wrote:
> I'd just be a bit careful about this.

Good point - thanks.

I will - be careful.  I have no delusions that I know what would be an
"improvement" to the scheduler - if anything.

Ingo, if I understood correctly, suggested pushing any necessary detail
of the CPU hierarchy into the scheduler domains, so that his latest work
tuning migration costs could pick it up from there.

It makes good sense for the migration cost estimation to be based on
whatever CPU hierarchy is visible in the sched domains.

But if we knew the CPU hierarchy in more detail, and if we had some
other use for that detail (we don't that I know), then I take it from
your comment that we should be reluctant to push those details into the
sched domains.  Put them someplace else if we need them.


One question - how serious do you view difference in migration cost
between say 21.7 and 25.3, two of the cacheflush times I reported on a
small SN2?

I'm guessing that this is probably below the noise threshold, at least
as far as scheduler domains, schedulers and migration care, unless and
until some persuasive measurements show a situation in which it matters.

As you say - not an exact science.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Nick Piggin
Paul Jackson wrote:
> Ingo wrote:
> > if you create a sched-domains hierarchy (based on the SLIT tables, or in
> > whatever other way) that matches the CPU hierarchy then you'll
> > automatically get the proper distances detected.
>
> Yes - agreed.  I should push in the direction of improving the
> SN2 sched domain hierarchy.

I'd just be a bit careful about this. Your biggest systems will have
what? At least 7 or 8 domains if you're just going by the number of
hops, right? And maybe more if there is more to your topology than
just number of hops.
sched-domains firstly has a few problems even with your 2 level NUMA
domains (although I'm looking at fixing them if possible), but also
everything just has to do more work as you traverse the domains and
scan all CPUs for balancing opportunities. And it's not like the cpu
scheduler uses any sort of exact science to make choices...
Nick


RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Chen, Kenneth W
Ingo Molnar wrote on Sunday, April 03, 2005 7:30 AM
> how close are these numbers to the real worst-case migration costs on
> that box?

I booted your latest patch on a 4-way SMP box (1.5 GHz, 9MB ia64). This
is what it produces.  I think the estimate is excellent.

[00]:       -   10.4(0) 10.4(0) 10.4(0)
[01]:    10.4(0)    -   10.4(0) 10.4(0)
[02]:    10.4(0) 10.4(0)    -   10.4(0)
[03]:    10.4(0) 10.4(0) 10.4(0)    -
-
cacheflush times [1]: 10.4 (10448800)


One other minor thing: when booting a numa kernel on an smp box, there is
a numa scheduler domain at the top level and cache_hot_time will be set
to 0 in that case on the smp box.  Though this will be a moot point with
the recent patch from Suresh Siddha for removing the extra bogus scheduler
domains.  http://marc.theaimsgroup.com/?t=11124020801&r=1&w=2




RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Chen, Kenneth W
Ingo Molnar wrote on Saturday, April 02, 2005 11:04 PM
> the default on ia64 (32MB) was way too large and caused the search to
> start from 64MB. That can take a _long_ time.
>
> i've attached a new patch with your changes included, and a couple of
> new things added:
>
>  - removed the 32MB max_cache_size hack from ia64 - it should now fall
>back to the default 5MB and do a search from 10MB downwards. This
>should speed up the search.

The cache size information on ia64 is already available at the finger tip.
Here is a patch that I whipped up to set max_cache_size for ia64.


--- linux-2.6.12-rc1/arch/ia64/kernel/setup.c.orig  2005-04-03 17:14:40.0 -0700
+++ linux-2.6.12-rc1/arch/ia64/kernel/setup.c   2005-04-03 17:55:46.0 -0700
@@ -561,6 +561,7 @@ static void
 get_max_cacheline_size (void)
 {
unsigned long line_size, max = 1;
+   unsigned int cache_size = 0;
u64 l, levels, unique_caches;
 pal_cache_config_info_t cci;
 s64 status;
@@ -585,8 +586,11 @@ get_max_cacheline_size (void)
line_size = 1 << cci.pcci_line_size;
if (line_size > max)
max = line_size;
+   if (cache_size < cci.pcci_cache_size)
+   cache_size = cci.pcci_cache_size;
 }
   out:
+   max_cache_size = max(max_cache_size, cache_size);
if (max > ia64_max_cacheline_size)
ia64_max_cacheline_size = max;
 }





Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
> how close are these numbers to the real worst-case migration costs on 
> that box? What are the cache sizes and what is their hierarchies?
>  ...
> is there any workload that shows the same scheduling related performance 
> regressions, other than Ken's $1m+ benchmark kit?

I'll have to talk to some people Monday and get back to you.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
> if you create a sched-domains hierarchy (based on the SLIT tables, or in 
> whatever other way) that matches the CPU hierarchy then you'll 
> automatically get the proper distances detected.

Yes - agreed.  I should push in the direction of improving the
SN2 sched domain hierarchy.


Would be a good idea to rename 'cpu_distance()' to something more
specific, like 'cpu_dist_ndx()', and reserve the generic name
'cpu_distance()' for later use to return a scaled integer distance,
rather like 'node_distance()' does now.  For example, 'cpu_distance()'
might, someday, return integer values such as:

40  217  252  253

as are displayed (in tenths) in the debug line:

-
cacheflush times [4]: 4.0 (4080540) 21.7 (21781380) 25.2 (25259428) 25.3 
(25372682)
-

(that is, the integer (long)cost / 100000 - one less zero).

I don't know that we have any use, yet, for this 'cpu_distance()' as a
scaled integer value.  But I'd be more comfortable reserving that name
for that purpose.
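
[Illustration - not from the thread's patches: what a scaled-integer
cpu_distance() of this kind might look like, assuming the measured cost
matrix (in nanoseconds) were kept around. The matrix name and the
function name are hypothetical:]

	/* filled in by the calibration code; nanoseconds per migration */
	static unsigned long long migration_cost_matrix[NR_CPUS][NR_CPUS];

	__init static int cpu_distance_scaled(int cpu1, int cpu2)
	{
		/* ns -> tenths of a millisecond, i.e. 4.0 msec -> 40 */
		return (int)(migration_cost_matrix[cpu1][cpu2] / 100000);
	}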

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
> if_ there is a significant hierarchy between CPUs it
> should be represented via a matching sched-domains hierarchy,

Agreed.

I'll see how the sched domains hierarchy looks on bigger SN2 systems.

If the CPU hierarchy is not reflected in the sched-domain hierarchy any
better there, then I will look to involve the "SN2 sched domain
hierarchy experts" in improving the SN2 sched-domain hierarchy.

Ok - that works.  Your patch of yesterday provides just the tool
I need to measure this.  Cool.

> i'll first try the bottom-up approach to speed up detection (getting to
> the hump is very fast most of the time).

Good.

> then we can let the arch override the cpu_distance() method

I'm not aware we need that, yet anyway.  First I should see if
the SN2 sched_domains need improving.  Take a shot at doing it
'the right way' before we go inventing overrides.  I suspect
you agree.

> the migration cost matrix we can later use to tune all the other 
> sched-domains balancing related tunables as well

That comes close to my actual motivation here.  I hope to expose a
"cpu_distance" such as based on this cost matrix, to userland.

We already expose the SLIT table node distances (using SN2 specific
/proc files today, others are working on an arch-neutral mechanism).

As we push more cores and hyperthreads into a single package on one end,
and more complex numa topologies on the other end, this becomes
increasingly interesting to NUMA aware user software.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> 
>  3) I was noticing that my test system was only showing a couple of 
> distinct values for cpu_distance, even though it has 4 distinct 
> distances for values of node_distance.  So I coded up a variant of 
> cpu_distance that converts the problem to a node_distance problem, 
> and got the following cost matrix:

> The code (below) is twice as complicated, the runtime twice as long,
> and it's less intuitive - sched_domains seems more appropriate as
> the basis for migration costs than the node distances in SLIT tables.
> Finally, I don't know if distinguishing between costs of 21.7 and
> 25.3 is worth much.

the main problem is that we can do nothing with this matrix: we only 
print it, but then the values get written into a 0/1 sched-domains 
hierarchy - so the information is lost.

if you create a sched-domains hierarchy (based on the SLIT tables, or in 
whatever other way) that matches the CPU hierarchy then you'll 
automatically get the proper distances detected.

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Three more observations.
> 
>  1) The slowest measure_one() calls are, not surprisingly, those for the
> largest sizes.  At least on my test system of the moment, the plot
> of cost versus size has one major maximum (a one hump camel, not two).
> 
> Seems like if we computed from smallest size upward, instead of largest
> downward, and stopped whenever two consecutive measurements were less
> than say 70% of the max seen so far, then we could save a nice chunk
> of the time.
> 
> Of course, if two hump systems exist, this is not reliable on them.

yes, this is the approach i'm currently working on, but it's not 
reliable yet. (one of the systems i have drifts its cost into infinity 
after the hump, which shouldn't happen)
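
[Illustration of the bottom-up early-stop idea being discussed - not
code from the patch. measure_cost(), the ~5% stepping and the 70%
threshold are assumptions taken from Paul's description above:]

	static unsigned long long scan_for_hump(
			unsigned long long (*measure_cost)(unsigned long),
			unsigned long min_size, unsigned long max_size)
	{
		unsigned long long max_cost = 0, cost;
		unsigned long size;
		int below = 0;

		for (size = min_size; size <= max_size; size += size/20 + 1) {
			cost = measure_cost(size);
			if (cost > max_cost) {
				max_cost = cost;
				below = 0;
			} else if (cost < max_cost * 7 / 10) {
				/* two consecutive samples below 70% of max: done */
				if (++below == 2)
					break;
			} else {
				below = 0;
			}
		}
		return max_cost;
	}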

>  2) Trivial warning fix for printf format mismatch:

thx.

>  3) I was noticing that my test system was only showing a couple of 
> distinct values for cpu_distance, even though it has 4 distinct 
> distances for values of node_distance.  So I coded up a variant of 
> cpu_distance that converts the problem to a node_distance problem, 
> and got the following cost matrix:
> 
> === begin ===
> Total of 8 processors activated (15515.64 BogoMIPS).
> -
> migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
> -
>           [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
> [00]:       -    4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
> [01]:     4.0(0)    -   21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
> [02]:    21.7(1) 21.7(1)    -    4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
> [03]:    21.7(1) 21.7(1)  4.0(0)    -   25.3(3) 25.3(3) 25.2(2) 25.2(2)
> [04]:    25.2(2) 25.2(2) 25.3(3) 25.3(3)    -    4.0(0) 21.7(1) 21.7(1)
> [05]:    25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)    -   21.7(1) 21.7(1)
> [06]:    25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)    -    4.0(0)
> [07]:    25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)    -
> -
> cacheflush times [4]: 4.0 (4080540) 21.7 (21781380) 25.2 (25259428) 25.3 
> (25372682)

i'll first try the bottom-up approach to speed up detection (getting to
the hump is very fast most of the time). The hard part was to create a
workload that generates the hump reliably on a number of boxes - i'm
happy it works on ia64 too.

then we can let the arch override the cpu_distance() method, although i
do think that _if_ there is a significant hierarchy between CPUs it
should be represented via a matching sched-domains hierarchy, and the
full hierarchy should be tuned accordingly.

btw., the migration cost matrix we can later use to tune all the other 
sched-domains balancing related tunables as well - cache_hot_time is 
just the first obvious step. (which also happens to make the most 
difference.)

Ingo


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Ok - that flies, or at least walks.  It took 53 seconds to compute 
> this cost matrix.

53 seconds is too much - i'm working on reducing it.

> Here's what it prints, on a small 8 CPU ia64 SN2 Altix, with
> the migration_debug prints formatted separately from the primary
> table, for ease of reading:
> 
> Total of 8 processors activated (15548.60 BogoMIPS).
> -
> migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
> -
>           [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
> [00]:       -    4.0(0) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
> [01]:     4.0(0)    -   21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
> [02]:    21.7(1) 21.7(1)    -    4.0(0) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
> [03]:    21.7(1) 21.7(1)  4.0(0)    -   21.7(1) 21.7(1) 21.7(1) 21.7(1)
> [04]:    21.7(1) 21.7(1) 21.7(1) 21.7(1)    -    4.0(0) 21.7(1) 21.7(1)
> [05]:    21.7(1) 21.7(1) 21.7(1) 21.7(1)  4.0(0)    -   21.7(1) 21.7(1)
> [06]:    21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)    -    4.0(0)
> [07]:    21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)  4.0(0)    -

how close are these numbers to the real worst-case migration costs on 
that box? What are the cache sizes and what is their hierarchies?

i've attached another snapshot - there is no speedup yet, but i've 
changed the debug printout to be separate from the matrix printout, and 
i've fixed the cache_size printout. (the printout of a 68K cache was 
incorrect - that was just the last iteration step)

it will be interesting to see what effect the above asymmetry in 
migration costs will have on scheduling. With the 4 msec intra-node cutoff 
it should be pretty migration-happy; the inter-node 21 msec is rather high 
and should avoid unnecessary migration.

is there any workload that shows the same scheduling related performance 
regressions, other than Ken's $1m+ benchmark kit?

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4639,6 +4640,453 @@ void __devinit init_sched_build_groups(s
last->next = first;
 }
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads a big buffer to flush caches
+ * 2) the source CPU reads+dirties a shared buffer 
+ * 3) the target CPU reads+dirties the same shared buffer
+ * 4) the target CPU reads a big buffer to flush caches
+ *
+ * We measure how long steps #2 and #3 take (step #1 and #4 is not
+ * measured), in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a large buffer-size and iterate down to smaller
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * do a maximum search for the cost. The maximum cost for a migration
+ * occurs when the working set is just below the effective cache size.
+ */
+
+
+/*
+ * Flush the cache by reading a big buffer. (We want all writeback
+ * activities to subside. Works only if cache size is larger than
+ * 2*size, but that is good enough as the biggest migration effect
+ * is around cachesize size.)
+ */
+__init static void read_cache(void *__cache, unsigned long __size)
+{
+   unsigned long size = __size/sizeof(long);
+   unsigned long *cache = __cache;
+   volatile unsigned long data;
+   int i;
+
+   for (i = 0; i < 2*size; i += 4)
+   data = cache[i];
+}
+
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+__init static void touch_cache(void *__cache, unsigned long __size)
+{
+   unsigned long size = __size/sizeof(long), chunk1 = size/3,
+   chunk2 = 2*size/3;
+   unsigned long *cache = __cache;
+   int i;
+
+   for (i = 0; i < size/6; i += 4) {
+   switch (i % 6) {
+   case 0: cache[i]++;
+   case 1: cache[size-1-i]++;
+   case 2: cache[chunk1-i]++;
+   case 3: cache[chunk1+i]++;
+   case 4: cache[chunk2-i]++;
+   case 5: cache[chunk2+i]++;
+   }
+   }
+}
+
+struct flush_data {
+   unsigned long source, target;
+   void (*fn)(void *, unsigned long);
+   void *cache;
+   unsigned long size;
+   unsigned long long delta;
+};
+
+/*
+ * Dirty L2 on the source CPU:
+ */
+__init static 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Three more observations.

 1) The slowest measure_one() calls are, not surprisingly, those for the
largest sizes.  At least on my test system of the moment, the plot
of cost versus size has one major maximum (a one hump camel, not two).

Seems like if we computed from smallest size upward, instead of largest
downward, and stopped whenever two consecutive measurements were less
than say 70% of the max seen so far, then we could save a nice chunk
of the time.

Of course, if two hump systems exist, this is not reliable on them.

 2) Trivial warning fix for printf format mismatch:

=== begin ===
--- 2.6.12-rc1-mm4.orig/kernel/sched.c  2005-04-03 06:32:34.0 -0700
+++ 2.6.12-rc1-mm4/kernel/sched.c   2005-04-03 06:34:07.0 -0700
@@ -5211,7 +5211,7 @@ void __devinit calibrate_migration_costs
 #ifdef CONFIG_X86
cpu_khz/1000
 #else
-   -1
+   -1L
 #endif
);
printk("-\n");
=== end ===


 3) I was noticing that my test system was only showing a couple of distinct
values for cpu_distance, even though it has 4 distinct distances for
values of node_distance.  So I coded up a variant of cpu_distance that
converts the problem to a node_distance problem, and got the following
cost matrix:

=== begin ===
Total of 8 processors activated (15515.64 BogoMIPS).
-
migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
-
          [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
[00]:       -    4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[01]:     4.0(0)    -   21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[02]:    21.7(1) 21.7(1)    -    4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
[03]:    21.7(1) 21.7(1)  4.0(0)    -   25.3(3) 25.3(3) 25.2(2) 25.2(2)
[04]:    25.2(2) 25.2(2) 25.3(3) 25.3(3)    -    4.0(0) 21.7(1) 21.7(1)
[05]:    25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)    -   21.7(1) 21.7(1)
[06]:    25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)    -    4.0(0)
[07]:    25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)    -
-
cacheflush times [4]: 4.0 (4080540) 21.7 (21781380) 25.2 (25259428) 25.3 
(25372682)
-
=== end ===


The code (below) is twice as complicated, the runtime twice as long,
and it's less intuitive - sched_domains seems more appropriate as
the basis for migration costs than the node distances in SLIT tables.
Finally, I don't know if distinguishing between costs of 21.7 and
25.3 is worth much.

So the case for switching to this node_distance base is less than
persuasive, to put it politely.

Perhaps its only real value is in highlighting that perhaps the
code to setup the sched_domain topology on our ia64 SN2 Altix systems
is too coarse, given that it only found two distance values, not four.

If that's the case, I will have to call in someone else to examine
whether it's appropriate to refine the sched_domains setup for this
kind of system.  I'm not competent to determine that, nor to code it.

Here's the code that bases cpu_distance on node_distance:

=== begin ===
__init static int cmpint(const void *a, const void *b)
{
return *(int *)a - *(int *)b;
}

/*
 * Estimate distance of two CPUs based on their node_distance,
 * mapping to sequential integers 0, 1, ... N-1, for the N
 * distinct values of distances (closest CPUs are distance 0,
 * farthest CPUs are distance N-1).  If there are more than
 * MAX_DOMAIN_DISTANCE distinct different distance values,
 * collapse the larger distances to one value.
 */

__init static unsigned long cpu_distance(int cpu1, int cpu2)
{
static int num_dist_vals;
static int node_distances[MAX_DOMAIN_DISTANCE];
int dist = node_distance(cpu_to_node(cpu1), cpu_to_node(cpu2));
int v;

if (num_dist_vals == 0) {
int i, j, k;

/*
 * For each dist not already in node_distances[], if there's
 * room or it's less than an existing 'luser' entry, add it.
 */
for_each_online_node(i) {
for_each_online_node(j) {
int dist = node_distance(i, j);
int luser = -1;

for (k = 0; k < num_dist_vals; k++) {
if (node_distances[k] == dist)
break;
if (dist < node_distances[k])
 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ok - that flies, or at least walks.  It took 53 seconds to
compute this cost matrix.

Here's what it prints, on a small 8 CPU ia64 SN2 Altix, with
the migration_debug prints formatted separately from the primary
table, for ease of reading:

Total of 8 processors activated (15548.60 BogoMIPS).
-
migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
-
          [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
[00]:       -    4.0(0) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
[01]:     4.0(0)    -   21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
[02]:    21.7(1) 21.7(1)    -    4.0(0) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
[03]:    21.7(1) 21.7(1)  4.0(0)    -   21.7(1) 21.7(1) 21.7(1) 21.7(1)
[04]:    21.7(1) 21.7(1) 21.7(1) 21.7(1)    -    4.0(0) 21.7(1) 21.7(1)
[05]:    21.7(1) 21.7(1) 21.7(1) 21.7(1)  4.0(0)    -   21.7(1) 21.7(1)
[06]:    21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)    -    4.0(0)
[07]:    21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)  4.0(0)    -
-
cacheflush times [2]: 4.0 (4059264) 21.7 (21764604)
-

-> [0][1][10485760]   0.0 (0): (236441590 260844347 -24402757)
-> [0][1][9961472]   0.0 (0): (223517112 247446351 -23929239)
-> [0][1][9463398]   0.0 (0): (210676318 234128642 -23452324)
-> [0][1][8990228]   0.0 (0): (199150391 222962366 -23811975)
-> [0][1][8540716]   0.0 (0): (188000682 211792893 -23792211)
-> [0][1][8113680]   0.0 (0): (177705384 201661649 -23956265)
-> [0][1][7707996]   0.0 (0): (167300335 190993072 -23692737)
-> [0][1][7322596]   0.0 (0): (157792762 181764189 -23971427)
-> [0][1][6956466]   0.0 (0): (148554966 172428430 -23873464)
-> [0][1][6608642]   0.0 (0): (140208195 163875201 -23667006)
-> [0][1][6278209]   0.0 (0): (131352820 155083956 -23731136)
-> [0][1][5964298]   0.0 (0): (123604215 147567322 -23963107)
-> [0][1][5666083]   0.0 (0): (116411565 140028494 -23616929)
-> [0][1][5382778]   0.0 (0): (109268755 133013626 -23744871)
-> [0][1][5113639]   0.0 (0): (102398180 126017425 -23619245)
-> [0][1][4857957]   0.0 (0): (95917364 119835534 -23918170)
-> [0][1][4615059]   0.0 (0): (90016707 114103575 -24086868)
-> [0][1][4384306]   0.0 (0): (84323765 108006547 -23682782)
-> [0][1][4165090]   0.0 (0): (79059754 102627005 -23567251)
-> [0][1][3956835]   0.0 (0): (73688423 97291492 -23603069)
-> [0][1][3758993]   0.0 (0): (68716008 88560989 -19844981)
-> [0][1][3571043]   0.0 (0): (63733160 81897350 -18164190)
-> [0][1][3392490]   0.0 (0): (59879383 74232277 -14352894)
-> [0][1][3222865]   0.0 (0): (56841544 66555118 -9713574)
-> [0][1][3061721]   0.0 (0): (52946522 56831787 -3885265)
-> [0][1][2908634]   2.1 (0): (48782033 46610015  2172018)
-> [0][1][2763202]   7.4 (0): (45641483 38180422  7461061)
-> [0][1][2625041]   8.1 (0): (42666487 34547956  8118531)
-> [0][1][2493788]   8.1 (0): (40480659 32408260  8072399)
-> [0][1][2369098]   8.1 (0): (37962874 30163246  7799628)
-> [0][1][2250643]   8.1 (0): (34472406 26857206  7615200)
-> [0][1][2138110]   8.1 (0): (31271314 23649223  7622091)
-> [0][1][2031204]   8.1 (0): (28089754 21439413  6650341)
-> [0][1][1929643]   8.1 (0): (26354009 18543359  7810650)
-> [0][1][1833160]   8.1 (0): (21147235 14447434  6699801)
-> [0][1][1741502]   8.1 (0): (18121355 12206595  5914760)
-> [0][1][1654426]   8.1 (0): (15329605 10598656  4730949)
-> [0][1][1571704]   8.1 (0): (13611633  8689517  4922116)
-> [0][1][1493118]   8.1 (0): (11372044  6757841  4614203)
-> [0][1][1418462]   8.1 (0): ( 9444150  4882452  4561698)
-> [0][1][1347538]   8.1 (0): ( 8191406  4085242  4106164)
-> [0][1][1280161]   8.1 (0): ( 7790609  3898213  3892396)
-> [0][1][1216152]   8.1 (0): ( 7374407  3707184  3667223)
-> [0][1][1155344]   8.1 (0): ( 6999015  3515903  3483112)
-> [0][1][1097576]   8.1 (0): ( 6673248  3322754  3350494)
-> [0][1][1042697]   8.1 (0): ( 6335524  3161843  3173681)
-> [0][1][ 990562]   8.1 (0): ( 6004402  3008483  2995919)
-> [0][1][ 941033]   8.1 (0): ( 5725906  2863829  2862077)
-> [0][1][ 893981]   8.1 (0): ( 5426110  2734901  2691209)
-> [0][1][ 849281]   8.1 (0): ( 5140906  2596169  2544737)
-> [0][1][ 806816]   8.1 (0): ( 4898502  2465125  2433377)
-> [0][1][ 766475]   8.1 (0): ( 4649361  2349720  2299641)
-> [0][1][ 728151]   8.1 (0): ( 4427640  2224358  2203282)
-> [0][1][ 691743]   8.1 (0): ( 4205722  2113134  2092588)
-> [0][1][ 657155]   8.1 (0): ( 3991213  1997003  1994210)
-> [0][1][ 624297]   8.1 (0): ( 3808184  1922251  1885933)
-> [0][1][ 593082]   8.1 (0): ( 3637960  1824619  1813341)
-> [0][1][ 563427]   8.1 (0): ( 3436507  1717571  1718936)
-> [0][1][ 535255]   8.1 (0): ( 3258815  1638947  1619868)
-> [0][1][ 508492]   8.1 (0): ( 310  1554970  1552807)
-> [0][1][ 483067]   8.1 (0): ( 2947291  1476728  1470563)
-> [0][1][ 458913]   8.1 (0): ( 2791433  1408435  1382998)
-> [0][1][ 435967]   8.1 (0): ( 2652944  1322870  1330074)
-> [0][1][ 414168]   8.1 (0): ( 2535588  1270619  1264969)
-> [0][1][ 393459]   8.1 (0): ( 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Earlier, Paul wrote:
> Note the first 3 chars of the panic message "4.5".  This looks like it
> might be the [00]-[01] entry of Ingo's table, flushed out when the
> newlines of the panic came through.

For the record, the above speculation is probably wrong.

More likely, the first six characters "4.5(0)" of my quoted panic
message came out some time before the panic, and represent the
[0]-[1] entry of the table.  These six chars came out at approx.
nine minutes into the calculation, and the timer panic'd the system at
ten minutes.  I didn't look at the screen between the 9th and 10th
minute, so I didn't realize that it had finally computed one table entry.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
> the default on ia64 (32MB) was way too large

Agreed.  It took about 9 minutes to search the first pair of cpus
(cpu 0 to cpu 1) from a size of 67107840 down to a size of 62906,
based on some prints I added since my last message.


> it seems the screen blanking timer hit

Ah - yes.  That makes sense.


> do a search from 10MB downwards. This
>   should speed up the search.

That will help (I'm guessing not enough - will see shortly.)


> verbose printouts 

I will put them to good use.

Thanks.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Three more observations.

 1) The slowest measure_one() calls are, not surprisingly, those for the
largest sizes.  At least on my test system of the moment, the plot
of cost versus size has one major maximum (a one hump camel, not two).

Seems like if we computed from smallest size upward, instead of largest
downward, and stopped whenever two consecutive measurements were less
than say 70% of the max seen so far, then we could save a nice chunk
of the time.

Of course, if two hump systems exist, this is not reliable on them.

 2) Trivial warning fix for printf format mismatch:

=== begin ===
--- 2.6.12-rc1-mm4.orig/kernel/sched.c  2005-04-03 06:32:34.0 -0700
+++ 2.6.12-rc1-mm4/kernel/sched.c   2005-04-03 06:34:07.0 -0700
@@ -5211,7 +5211,7 @@ void __devinit calibrate_migration_costs
 #ifdef CONFIG_X86
cpu_khz/1000
 #else
-   -1
+   -1L
 #endif
);
printk(-\n);
=== end ===


 3) I was noticing that my test system was only showing a couple of distinct
values for cpu_distance, even though it has 4 distinct distances for
values of node_distance.  So I coded up a variant of cpu_distance that
converts the problem to a node_distance problem, and got the following
cost matrix:

=== begin ===
Total of 8 processors activated (15515.64 BogoMIPS).
-
migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
-
  [00][01][02][03][04][05][06][07]
[00]: - 4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[01]:   4.0(0)-21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
[02]:  21.7(1) 21.7(1)- 4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
[03]:  21.7(1) 21.7(1)  4.0(0)-25.3(3) 25.3(3) 25.2(2) 25.2(2)
[04]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)- 4.0(0) 21.7(1) 21.7(1)
[05]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)-21.7(1) 21.7(1)
[06]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)- 4.0(0)
[07]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)-
-
cacheflush times [4]: 4.0 (4080540) 21.7 (21781380) 25.2 (25259428) 25.3 
(25372682)
-
=== end ===


The code (below) is twice as complicated, the runtime twice as long,
and it's less intuitive - sched_domains seems more appropriate as
the basis for migration costs than the node distances in SLIT tables.
Finally, I don't know if distinguishing between costs of 21.7 and
25.3 is worth much.

So the case for switching to this node_distance base is less than
persuasive, to put it politely.

Perhaps it's only real value is in highlighting that perhaps the
code to setup the sched_domain topology on our ia64 SN2 Altix systems
is too coarse, given that it only found two distance values, not four.

If that's the case, I will have to call in someone else to examine
whether it's appropriate to refine the sched_domains setup for this
kind of system.  I'm not competent to determine that, nor to code it.

Here's the code that bases cpu_distance on node_distance:

=== begin ===
__init static int cmpint(const void *a, const void *b)
{
return *(int *)a - *(int *)b;
}

/*
 * Estimate distance of two CPUs based on their node_distance,
 * mapping to sequential integers 0, 1, ... N-1, for the N
 * distinct values of distances (closest CPUs are distance 0,
 * farthest CPUs are distance N-1).  If there are more than
 * MAX_DOMAIN_DISTANCE distinct different distance values,
 * collapse the larger distances to one value.
 */

__init static unsigned long cpu_distance(int cpu1, int cpu2)
{
static int num_dist_vals;
static int node_distances[MAX_DOMAIN_DISTANCE];
int dist = node_distance(cpu_to_node(cpu1), cpu_to_node(cpu2));
int v;

if (num_dist_vals == 0) {
int i, j, k;

/*
 * For each dist not already in node_distances[], if there's
 * room or it's less than an existing 'luser' entry, add it.
 */
for_each_online_node(i) {
for_each_online_node(j) {
int dist = node_distance(i, j);
int luser = -1;

for (k = 0; k < num_dist_vals; k++) {
if (node_distances[k] == dist)
break;
if (dist < node_distances[k])
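
(The archive truncates Paul's function at this point.  As a purely
illustrative sketch of the mapping his comment describes - collect the
distinct node_distance values, sort them ascending, and return the index
of a pair's distance - here is a standalone C version; cmpint() mirrors
Paul's helper, while sort_distances() and distance_index() are
hypothetical stand-ins, not a reconstruction of the missing lines:)

#include <stdlib.h>

/* ascending comparison, as in Paul's cmpint() above */
static int cmpint(const void *a, const void *b)
{
        return *(const int *)a - *(const int *)b;
}

/* sort the table of distinct distances: index 0 = closest, nvals-1 = farthest */
static void sort_distances(int *table, int nvals)
{
        qsort(table, nvals, sizeof(int), cmpint);
}

/* map one pair's node_distance to its index; collapse larger values onto nvals-1 */
static int distance_index(const int *table, int nvals, int dist)
{
        int v;

        for (v = 0; v < nvals; v++)
                if (dist <= table[v])
                        break;
        return (v < nvals) ? v : nvals - 1;
}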
 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 Ok - that flies, or at least walks.  It took 53 seconds to compute 
 this cost matrix.

53 seconds is too much - i'm working on reducing it.

 Here's what it prints, on a small 8 CPU ia64 SN2 Altix, with
 the migration_debug prints formatted separately from the primary
 table, for ease of reading:
 
 Total of 8 processors activated (15548.60 BogoMIPS).
 -
 migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
 -
   [00][01][02][03][04][05][06][07]
 [00]: - 4.0(0) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
 [01]:   4.0(0)-21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
 [02]:  21.7(1) 21.7(1)- 4.0(0) 21.7(1) 21.7(1) 21.7(1) 21.7(1)
 [03]:  21.7(1) 21.7(1)  4.0(0)-21.7(1) 21.7(1) 21.7(1) 21.7(1)
 [04]:  21.7(1) 21.7(1) 21.7(1) 21.7(1)- 4.0(0) 21.7(1) 21.7(1)
 [05]:  21.7(1) 21.7(1) 21.7(1) 21.7(1)  4.0(0)-21.7(1) 21.7(1)
 [06]:  21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)- 4.0(0)
 [07]:  21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1) 21.7(1)  4.0(0)-

how close are these numbers to the real worst-case migration costs on 
that box? What are the cache sizes and what are their hierarchies?

i've attached another snapshot - there is no speedup yet, but i've 
changed the debug printout to be separate from the matrix printout, and 
i've fixed the cache_size printout. (the printout of a 68K cache was 
incorrect - that was just the last iteration step)

it will be interesting to see what effect the above asymmetry in 
migration costs will have on scheduling. With 4msec intra-node cutoff it 
should be pretty migration-happy, inter-node 21 msec is rather high and 
should avoid unnecessary migration.
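
For context on how the detected value is consumed: it ends up as the
domain's cache_hot_time, which load balancing consults before pulling a
recently-run task.  A rough, paraphrased sketch of that test (names are
approximate and not quoted from the patch):

/*
 * A task whose last run ended less than cache_hot_time nanoseconds ago
 * is treated as cache-hot and skipped during normal balancing, so a
 * larger detected cost means a longer do-not-migrate window.
 */
static inline int task_is_cache_hot(unsigned long long now,
                                    unsigned long long last_ran,
                                    unsigned long long cache_hot_time)
{
        return (long long)(now - last_ran) < (long long)cache_hot_time;
}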

is there any workload that shows the same scheduling related performance 
regressions, other than Ken's $1m+ benchmark kit?

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4639,6 +4640,453 @@ void __devinit init_sched_build_groups(s
last->next = first;
 }
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads a big buffer to flush caches
+ * 2) the source CPU reads+dirties a shared buffer 
+ * 3) the target CPU reads+dirties the same shared buffer
+ * 4) the target CPU reads a big buffer to flush caches
+ *
+ * We measure how long steps #2 and #3 take (step #1 and #4 is not
+ * measured), in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a large buffer-size and iterate down to smaller
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * do a maximum search for the cost. The maximum cost for a migration
+ * occurs when the working set is just below the effective cache size.
+ */
+
+
+/*
+ * Flush the cache by reading a big buffer. (We want all writeback
+ * activities to subside. Works only if cache size is larger than
+ * 2*size, but that is good enough as the biggest migration effect
+ * is around cachesize size.)
+ */
+__init static void read_cache(void *__cache, unsigned long __size)
+{
+   unsigned long size = __size/sizeof(long);
+   unsigned long *cache = __cache;
+   volatile unsigned long data;
+   int i;
+
+   for (i = 0; i < 2*size; i += 4)
+   data = cache[i];
+}
+
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+__init static void touch_cache(void *__cache, unsigned long __size)
+{
+   unsigned long size = __size/sizeof(long), chunk1 = size/3,
+   chunk2 = 2*size/3;
+   unsigned long *cache = __cache;
+   int i;
+
+   for (i = 0; i < size/6; i += 4) {
+   switch (i % 6) {
+   case 0: cache[i]++;
+   case 1: cache[size-1-i]++;
+   case 2: cache[chunk1-i]++;
+   case 3: cache[chunk1+i]++;
+   case 4: cache[chunk2-i]++;
+   case 5: cache[chunk2+i]++;
+   }
+   }
+}
+
+struct flush_data {
+   unsigned long source, target;
+   void (*fn)(void *, unsigned long);
+   void *cache;
+   unsigned long size;
+   unsigned long long delta;
+};
+
+/*
+ 
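
(The archived copy of the patch is cut off above.  As a compact
illustration of the search loop the patch's comment block describes -
an editorial sketch with hypothetical names, not Ingo's code - the
top-down iteration with a running maximum looks roughly like this:)

/*
 * Start from a large working set, shrink in ~5% steps, and remember the
 * maximum measured cost: the peak sits just below the effective cache
 * size.  measure() stands in for the timed steps #2/#3 above combined
 * into a single per-size cost, and is purely hypothetical.
 */
static unsigned long long
search_max_migration_cost(unsigned long long (*measure)(unsigned long size),
                          unsigned long max_size, unsigned long min_size)
{
        unsigned long long cost, max_cost = 0;
        unsigned long size;

        for (size = max_size; size > min_size; size = size*95/100) {
                cost = measure(size);           /* one buffer-size probe */
                if (cost > max_cost)
                        max_cost = cost;
        }
        return max_cost;
}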

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 Three more observations.
 
  1) The slowest measure_one() calls are, not surprisingly, those for the
 largest sizes.  At least on my test system of the moment, the plot
 of cost versus size has one major maximum (a one hump camel, not two).
 
 Seems like if we computed from smallest size upward, instead of largest
 downward, and stopped whenever two consecutive measurements were less
 than say 70% of the max seen so far, then we could save a nice chunk
 of the time.
 
 Of course, if two hump systems exist, this is not reliable on them.

yes, this is the approach i'm currently working on, but it's not 
reliable yet. (one of the systems i have drifts its cost into infinity 
after the hump, which shouldn't happen)
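
A minimal sketch of that bottom-up early-stop idea, using the 70% cutoff
and two-consecutive-measurements rule suggested above (the ~5% growth
step and the measure() callback are assumptions, not code from the
thread):

/*
 * Grow the working set from small to large, track the best cost seen,
 * and stop once two consecutive probes fall below 70% of that maximum.
 * As noted above, a two-hump cache hierarchy would defeat this.
 */
static unsigned long long
search_up_with_cutoff(unsigned long long (*measure)(unsigned long size),
                      unsigned long min_size, unsigned long max_size)
{
        unsigned long long cost, max_cost = 0;
        unsigned long size;
        int below_cutoff = 0;

        for (size = min_size; size <= max_size; size += size/20 + 1) {
                cost = measure(size);
                if (cost > max_cost) {
                        max_cost = cost;
                        below_cutoff = 0;
                } else if (cost < max_cost*7/10) {
                        if (++below_cutoff == 2)
                                break;  /* two in a row below 70%: past the hump */
                } else {
                        below_cutoff = 0;
                }
        }
        return max_cost;
}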

  2) Trivial warning fix for printf format mismatch:

thx.

  3) I was noticing that my test system was only showing a couple of 
 distinct values for cpu_distance, even though it has 4 distinct 
 distances for values of node_distance.  So I coded up a variant of 
 cpu_distance that converts the problem to a node_distance problem, 
 and got the following cost matrix:
 
 === begin ===
 Total of 8 processors activated (15515.64 BogoMIPS).
 -
 migration cost matrix (max_cache_size: 0, cpu: -1 MHz):
 -
   [00][01][02][03][04][05][06][07]
 [00]: - 4.0(0) 21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
 [01]:   4.0(0)-21.7(1) 21.7(1) 25.2(2) 25.2(2) 25.3(3) 25.3(3)
 [02]:  21.7(1) 21.7(1)- 4.0(0) 25.3(3) 25.3(3) 25.2(2) 25.2(2)
 [03]:  21.7(1) 21.7(1)  4.0(0)-25.3(3) 25.3(3) 25.2(2) 25.2(2)
 [04]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)- 4.0(0) 21.7(1) 21.7(1)
 [05]:  25.2(2) 25.2(2) 25.3(3) 25.3(3)  4.0(0)-21.7(1) 21.7(1)
 [06]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)- 4.0(0)
 [07]:  25.3(3) 25.3(3) 25.2(2) 25.2(2) 21.7(1) 21.7(1)  4.0(0)-
 -
 cacheflush times [4]: 4.0 (4080540) 21.7 (21781380) 25.2 (25259428) 25.3 
 (25372682)

i'll first try the bottom-up approach to speed up detection (getting to
the hump is very fast most of the time). The hard part was to create a
workload that generates the hump reliably on a number of boxes - i'm
happy it works on ia64 too.

then we can let the arch override the cpu_distance() method, although i
do think that _if_ there is a significant hierarchy between CPUs it
should be represented via a matching sched-domains hierarchy, and the
full hierarchy should be tuned accordingly.

btw., the migration cost matrix we can later use to tune all the other 
sched-domains balancing related tunables as well - cache_hot_time is 
just the first obvious step. (which also happens to make the most 
difference.)

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 
  3) I was noticing that my test system was only showing a couple of 
 distinct values for cpu_distance, even though it has 4 distinct 
 distances for values of node_distance.  So I coded up a variant of 
 cpu_distance that converts the problem to a node_distance problem, 
 and got the following cost matrix:

 The code (below) is twice as complicated, the runtime twice as long,
 and it's less intuitive - sched_domains seems more appropriate as
 the basis for migration costs than the node distances in SLIT tables.
 Finally, I don't know if distinguishing between costs of 21.7 and
 25.3 is worth much.

the main problem is that we can do nothing with this matrix: we only 
print it, but then the values get written into a 0/1 sched-domains 
hierarchy - so the information is lost.

if you create a sched-domains hierarchy (based on the SLIT tables, or in 
whatever other way) that matches the CPU hierarchy then you'll 
automatically get the proper distances detected.

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
 _if_ there is a significant hierarchy between CPUs it
 should be represented via a matching sched-domains hierarchy,

Agreed.

I'll see how the sched domains hierarchy looks on a bigger SN2 systems.

If the CPU hierarchy is not reflected in the sched-domain hierarchy any
better there, then I will look to involve the SN2 sched domain
hierarchy experts in improving the SN2 sched-domain hierarchy.

Ok - that works.  Your patch of yesterday provides just the tool
I need to measure this.  Cool.

 i'll first try the bottom-up approach to speed up detection (getting to
 the hump is very fast most of the time).

Good.

 then we can let the arch override the cpu_distance() method

I'm not aware we need that, yet anyway.  First I should see if
the SN2 sched_domains need improving.  Take a shot at doing it
'the right way' before we go inventing overrides.  I suspect
you agree.

 the migration cost matrix we can later use to tune all the other 
 sched-domains balancing related tunables as well

That comes close to my actual motivation here.  I hope to expose a
cpu_distance, such as one based on this cost matrix, to userland.

We already expose the SLIT table node distances (using SN2 specific
/proc files today, others are working on an arch-neutral mechanism).

As we push more cores and hyperthreads into a single package on one end,
and more complex numa topologies on the other end, this becomes
increasingly interesting to NUMA aware user software.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
 if you create a sched-domains hierarchy (based on the SLIT tables, or in 
 whatever other way) that matches the CPU hierarchy then you'll 
 automatically get the proper distances detected.

Yes - agreed.  I should push in the direction of improving the
SN2 sched domain hierarchy.


Would be a good idea to rename 'cpu_distance()' to something more
specific, like 'cpu_dist_ndx()', and reserve the generic name
'cpu_distance()' for later use to return a scaled integer distance,
rather like 'node_distance()' does now.  For example, 'cpu_distance()'
might, someday, return integer values such as:

40  217  252  253

as are displayed (in tenths) in the debug line:

-
cacheflush times [4]: 4.0 (4080540) 21.7 (21781380) 25.2 (25259428) 25.3 
(25372682)
-

(that is, the integer (long)cost / 10 - one less zero).

I don't know that we have any use, yet, for this 'cpu_distance()' as a
scaled integer value.  But I'd be more comfortable reserving that name
for that purpose.
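
A minimal sketch of what such a scaled-integer cpu_distance() might
return (illustrative only: migration_cost[][] is a hypothetical table
holding the raw values shown in parentheses above, and the divisor is
chosen simply to reproduce the 40/217/252/253 example, e.g.
4080540 / 100000 = 40):

#define ILLUSTRATIVE_NR_CPUS 8

/* hypothetical storage for the measured costs, in the raw units above */
static long migration_cost[ILLUSTRATIVE_NR_CPUS][ILLUSTRATIVE_NR_CPUS];

static long cpu_distance_scaled(int cpu1, int cpu2)
{
        return migration_cost[cpu1][cpu2] / 100000;
}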

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
 how close are these numbers to the real worst-case migration costs on 
 that box? What are the cache sizes and what are their hierarchies?
  ...
 is there any workload that shows the same scheduling related performance 
 regressions, other than Ken's $1m+ benchmark kit?

I'll have to talk to some people Monday and get back to you.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


RE: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Chen, Kenneth W
Ingo Molnar wrote on Saturday, April 02, 2005 11:04 PM
 the default on ia64 (32MB) was way too large and caused the search to
 start from 64MB. That can take a _long_ time.

 i've attached a new patch with your changes included, and a couple of
 new things added:

  - removed the 32MB max_cache_size hack from ia64 - it should now fall
back to the default 5MB and do a search from 10MB downwards. This
should speed up the search.

The cache size information on ia64 is already available at our fingertips.
Here is a patch that I whipped up to set max_cache_size for ia64.


--- linux-2.6.12-rc1/arch/ia64/kernel/setup.c.orig  2005-04-03 
17:14:40.0 -0700
+++ linux-2.6.12-rc1/arch/ia64/kernel/setup.c   2005-04-03 17:55:46.0 
-0700
@@ -561,6 +561,7 @@ static void
 get_max_cacheline_size (void)
 {
unsigned long line_size, max = 1;
+   unsigned int cache_size = 0;
u64 l, levels, unique_caches;
 pal_cache_config_info_t cci;
 s64 status;
@@ -585,8 +586,11 @@ get_max_cacheline_size (void)
line_size = 1 << cci.pcci_line_size;
if (line_size > max)
max = line_size;
+   if (cache_size < cci.pcci_cache_size)
+   cache_size = cci.pcci_cache_size;
 }
   out:
+   max_cache_size = max(max_cache_size, cache_size);
if (max > ia64_max_cacheline_size)
ia64_max_cacheline_size = max;
 }



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Paul wrote:
 I should push in the direction of improving the
 SN2 sched domain hierarchy.

Nick wrote:
 I'd just be a bit careful about this.

Good point - thanks.

I will - be careful.  I have no delusions that I know what would be an
improvement to the scheduler - if anything.

Ingo, if I understood correctly, suggested pushing any necessary detail
of the CPU hierarchy into the scheduler domains, so that his latest work
tuning migration costs could pick it up from there.

It makes good sense for the migration cost estimation to be based on
whatever CPU hierarchy is visible in the sched domains.

But if we knew the CPU hierarchy in more detail, and if we had some
other use for that detail (we don't that I know), then I take it from
your comment that we should be reluctant to push those details into the
sched domains.  Put them someplace else if we need them.


One question - how serious do you view difference in migration cost
between say 21.7 and 25.3, two of the cacheflush times I reported on a
small SN2?

I'm guessing that this is probably below the noise threshold, at least
as far as scheduler domains, schedulers and migration care, unless and
until some persuasive measurements show a situation in which it matters.

As you say - not an exact science.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Andy Lutomirski
Paul Jackson wrote:
Ok - that flies, or at least walks.  It took 53 seconds to
compute this cost matrix.
Not that I really know what I'm talking about here, but this sounds 
highly parallelizable.  It seems like you could do N/2 measurements at a 
time, so this should be O(N) to compute the matrix (ignoring issues of 
how long it takes to write the data to memory, but that should be 
insignificant).

Even if you can't parallelize it all the way, it ought to at least help.
--Andy
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Andy wrote:
 Not that I really know what I'm talking about here, but this sounds 
 highly parallelizable.

I doubt it.  If we are testing the cost of a migration between CPUs
alpha and beta, and at the same time testing betweeen CPUs gamma and
delta, then often there will be some hardware that is shared by both the
alpha, beta path, and the gamma, delta path.  This would affect the
test results.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 Ingo, if I understood correctly, suggested pushing any necessary 
 detail of the CPU hierarchy into the scheduler domains, so that his 
 latest work tuning migration costs could pick it up from there.
 
 It makes good sense for the migration cost estimation to be based on 
 whatever CPU hierarchy is visible in the sched domains.
 
 But if we knew the CPU hierarchy in more detail, and if we had some 
 other use for that detail (we don't that I know), then I take it from 
 your comment that we should be reluctant to push those details into 
 the sched domains.  Put them someplace else if we need them.

There's no other place to push them - most of the hierarchy related 
decisions are done based on the domain tree. So the decision to make is: 
is it worth complicating the domain tree, in exchange for more accurate 
handling of the real hierarchy?.

In general, the pros are potentially more accuracy and thus higher 
application performance, the cons are overhead (more tree walking) and 
artifacts (the sched-domains logic is good but not perfect, and even if 
there were no bugs in it, the decisions are approximations. One more 
domain level might make things worse.)

but trying and benchmarking it is necessary to tell for sure.

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Paul Jackson
Ingo wrote:
 There's no other place to push them 

One could make a place, if the need arose.

 but trying and benchmarking it is necessary to tell for sure.

Hard to argue with that ... ;).

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson [EMAIL PROTECTED] 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-03 Thread Nick Piggin
On Sun, 2005-04-03 at 20:55 -0700, Paul Jackson wrote:

 But if we knew the CPU hierarchy in more detail, and if we had some
 other use for that detail (we don't that I know), then I take it from
 your comment that we should be reluctant to push those details into the
 sched domains.  Put them someplace else if we need them.
 

In a sense, the information *is* already there - in node_distance.
What I think should be done is probably to use node_distance when
calculating costs, and correlate that with sched-domains as best
we can.

I've got an idea of how to do it, but I'll wait until Ingo gets the
fundamentals working well before I have a look.

 
 One question - how serious do you view difference in migration cost
 between say 21.7 and 25.3, two of the cacheflush times I reported on a
 small SN2?
 
 I'm guessing that this is probably below the noise threshold, at least
 as far as scheduler domains, schedulers and migration care, unless and
 until some persuasive measurements show a situation in which it matters.
 

Yes, likely below noise. There is an issue with a behavioural
transition point in the wakeup code where you might see good
behaviour with 21 and bad with 25, or vice versa on some workloads.
This is fixed in the scheduler patches coming through -mm though.

But I wasn't worried so much about the absolute value not being
right, rather it maybe not being deterministic. So maybe depending
on what CPU gets assigned what cpuid, you might get different
values on identical machines.

 As you say - not an exact science.
 




-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-02 Thread Ingo Molnar

* Paul Jackson <[EMAIL PROTECTED]> wrote:

> Just so as no one else wastes time repeating the little bit I've done so 
> far, and so I don't waste time figuring out what is already known, 
> here's what I have so far, trying out Ingo's "sched: auto-tune 
> migration costs" on ia64 SN2:
> 
> To get it to compile against 2.6.12-rc1-mm4, I did thus:
> 
>   1. Manually edited "include/asm-x86_64/topology.h" to
>  remove .cache_hot_time (patch failed due to conflicts
>  with nearby changes to add some *_idx terms).

(next time you can ignore that hunk - we override the cache_hot_time 
value anyway.)

>   2. Moved the 394 line block of new code in kernel/sched.c
>  to _before_ the large  #ifdef ARCH_HAS_SCHED_DOMAIN,
>  #else, #endif block.  The ia64 arch (only) defines
>  ARCH_HAS_SCHED_DOMAIN, so was being denied use of Ingo's
>  code when it was buried in the '#else-#endif' side of
>  this large conditional block.

yeah, indeed. The place you moved it to is the right spot, as it's under 
CONFIG_SMP. I've done this in my tree too.

>   3. Add "#include <linux/vmalloc.h>" to kernel/sched.c

ok, did this in my tree too.

>   4. Don't print cpu_khz in the cost matrix header, as cpu_khz
>  is only in a few arch's (x86_64, ppc, i386, arm).

ok.

> Brought up 8 CPUs
> softlockup thread 7 started up.
> Total of 8 processors activated (15548.60 BogoMIPS).
> -
> migration cost matrix (max_cache_size: 33554432):
> -
>   [00][01][02][03][04][05][06][07]
> [00]: -
> = end =
> 
> Then it hung for 5 or 10 minutes, [...]

the default on ia64 (32MB) was way too large and caused the search to 
start from 64MB. That can take a _long_ time.

i've attached a new patch with your changes included, and a couple of 
new things added:

 - removed the 32MB max_cache_size hack from ia64 - it should now fall 
   back to the default 5MB and do a search from 10MB downwards. This
   should speed up the search.

 - added a migration_debug boot option - use it to get verbose printouts 
   about the search for the migration cost.

 - added a max_cache_size=<bytes> boot option for debugging.

 - a few cleanups

(in the next iteration of the patch i'll try a new method to further 
speed up the search - but didn't want to change it too much in this 
iteration.)

>  [] schedule_work+0x30/0x60
> sp=e1b03a8d7910 bsp=e1b03a8d14c8
>  [] blank_screen_t+0x30/0x60
> sp=e1b03a8d7910 bsp=e1b03a8d14b8
>  [] run_timer_softirq+0x2d0/0x4a0
> sp=e1b03a8d7910 bsp=e1b03a8d1410

i think the crash is an unrelated bug: it seems the screen blanking 
timer hit and has crashed the box - i suspect it didn't expect the bootup 
to take that long.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4639,6 +4640,438 @@ void __devinit init_sched_build_groups(s
last->next = first;
 }
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads a big buffer to flush caches
+ * 2) the source CPU reads+dirties a shared buffer 
+ * 3) the target CPU reads+dirties the same shared buffer
+ * 4) the target CPU reads a big buffer to flush caches
+ *
+ * We measure how long steps #2 and #3 take (step #1 and #4 is not
+ * measured), in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a large buffer-size and iterate down to smaller
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * do a maximum search for the cost. The maximum cost for a migration
+ * occurs when the working set is just below the effective cache size.
+ */
+
+
+/*
+ * Flush the cache by reading a big buffer. (We want all writeback
+ * activities to subside. Works only if cache size is larger than
+ * 2*size, but that is good enough as the biggest migration effect
+ * is around cachesize size.)
+ */
+__init static void read_cache(void *__cache, unsigned long __size)
+{
+   unsigned long size = __size/sizeof(long);
+   unsigned long *cache = __cache;
+   volatile unsigned long data;
+   int i;
+
+   for (i = 0; i < 2*size; i += 4)
+   data = cache[i];
+}
+
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, 

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-02 Thread Paul Jackson
Just so as no one else wastes time repeating the little bit I've done so
far, and so I don't waste time figuring out what is already known,
here's what I have so far, trying out Ingo's "sched: auto-tune migration
costs" on ia64 SN2:

To get it to compile against 2.6.12-rc1-mm4, I did thus:

  1. Manually edited "include/asm-x86_64/topology.h" to
 remove .cache_hot_time (patch failed due to conflicts
 with nearby changes to add some *_idx terms).
  2. Moved the 394 line block of new code in kernel/sched.c
 to _before_ the large  #ifdef ARCH_HAS_SCHED_DOMAIN,
 #else, #endif block.  The ia64 arch (only) defines
 ARCH_HAS_SCHED_DOMAIN, so was being denied use of Ingo's
 code when it was buried in the '#else-#endif' side of
 this large conditional block.
  3. Add "#include <linux/vmalloc.h>" to kernel/sched.c
  4. Don't print cpu_khz in the cost matrix header, as cpu_khz
 is only in a few arch's (x86_64, ppc, i386, arm).

Note that (2) was just a superficial fix - it compiles, but the result
could easily be insanely stupid and I'd have no clue.  I need to
read the code some more.

Booting on an 8 CPU ia64 SN2, the console output got far enough to show:

 begin 
Brought up 8 CPUs
softlockup thread 7 started up.
Total of 8 processors activated (15548.60 BogoMIPS).
-
migration cost matrix (max_cache_size: 33554432):
-
  [00][01][02][03][04][05][06][07]
[00]: -
= end =

Then it hung for 5 or 10 minutes, and then it blurted out a panic and
died. I'll quote the whole panic, including backtrace, in case someone
happens to see something obvious.

But I'm not asking anyone to think about this yet, unless it amuses
them.  I can usefully occupy myself reading the code and adding printk's
for a while.

Note the first 3 chars of the panic message "4.5".  This looks like it
might be the [00]-[01] entry of Ingo's table, flushed out when the
newlines of the panic came through.

 begin 
4.5(0)<1>Unable to handle kernel paging request at virtual address 
00010008
swapper[1]: Oops 8813272891392 [1]
Modules linked in:

Pid: 1, CPU 0, comm:  swapper
psr : 101008026018 ifs : 8288 ip  : []Not 
tainted
ip is at queue_work+0xb0/0x1a0
unat:  pfs : 0288 rsc : 0003
rnat: a00100ab2a50 bsps: 0010 pr  : 5a6956996a65
ldrs:  ccv :  fpsr: 0009804c8a70033f
csd :  ssd : 
b0  : a001000d99b0 b6  : a0013320 b7  : a00100490200
f6  : 1003e9ff7 f7  : 1003e000418d3645db265
f8  : 1003e3b8186ed f9  : 1003e5f3b
f10 : 1003e1000 f11 : 1003e0040
r1  : a00100c9de60 r2  :  r3  : 0001
r8  :  r9  :  r10 : a00100969c50
r11 : 0004 r12 : e1b03a8d7910 r13 : e1b03a8d
r14 :  r15 : 00010008 r16 : e1b03a8d0dc0
r17 : 00010008 r18 : 0103 r19 : a00100c32048
r20 : a00100c32018 r21 : a00100aa92c8 r22 : e03003005d90
r23 : e03003005da8 r24 : a00100cf2098 r25 : e03003005db0
r26 : a00100ab4bf4 r27 : e03003005d81 r28 : 00010004b001
r29 :  r30 : 00010004b000 r31 : a00100c32010

Call Trace:
 [] show_stack+0x80/0xa0
sp=e1b03a8d74d0 bsp=e1b03a8d1620
 [] show_regs+0x860/0x880
sp=e1b03a8d76a0 bsp=e1b03a8d15b8
 [] die+0x170/0x200
sp=e1b03a8d76b0 bsp=e1b03a8d1580
 [] ia64_do_page_fault+0x200/0xa40
sp=e1b03a8d76b0 bsp=e1b03a8d1520
 [] ia64_leave_kernel+0x0/0x290
sp=e1b03a8d7740 bsp=e1b03a8d1520
 [] queue_work+0xb0/0x1a0
sp=e1b03a8d7910 bsp=e1b03a8d14e0
 [] schedule_work+0x30/0x60
sp=e1b03a8d7910 bsp=e1b03a8d14c8
 [] blank_screen_t+0x30/0x60
sp=e1b03a8d7910 bsp=e1b03a8d14b8
 [] run_timer_softirq+0x2d0/0x4a0
sp=e1b03a8d7910 bsp=e1b03a8d1410
 [] __do_softirq+0x220/0x260
sp=e1b03a8d7930 bsp=e1b03a8d1378
 [] do_softirq+0x80/0xe0
sp=e1b03a8d7930 bsp=e1b03a8d1320
 [] irq_exit+0x90/0xc0
sp=e1b03a8d7930 bsp=e1b03a8d1310
 [] ia64_handle_irq+0x110/0x140
sp=e1b03a8d7930 bsp=e1b03a8d12d8
 [] ia64_leave_kernel+0x0/0x290

Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-02 Thread Paul Jackson
Ingo wrote:
> in theory the code should work fine on ia64 as well,

Nice.  I'll try it on our SN2 Altix IA64 as well.
Though I am being delayed a day or two in this
by irrelevant problems.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.650.933.1373, 
1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] sched: auto-tune migration costs [was: Re: Industry db benchmark result on recent 2.6 kernels]

2005-04-02 Thread Ingo Molnar

* Paul Jackson [EMAIL PROTECTED] wrote:

 Just so as no one else wastes time repeating the little bit I've done so 
 far, and so I don't waste time figuring out what is already known, 
 here's what I have so far, trying out Ingo's sched: auto-tune 
 migration costs on ia64 SN2:
 
 To get it to compile against 2.6.12-rc1-mm4, I did thus:
 
   1. Manually edited include/asm-x86_64/topology.h to
  remove .cache_hot_time (patch failed due to conflicts
  with nearby changes to add some *_idx terms).

(next time you can ignore that hunk - we override the cache_hot_time 
value anyway.)

   2. Moved the 394 line block of new code in kernel/sched.c
  to _before_ the large  #ifdef ARCH_HAS_SCHED_DOMAIN,
  #else, #endif block.  The ia64 arch (only) defines
  ARCH_HAS_SCHED_DOMAIN, so was being denied use of Ingo's
  code when it was buried in the '#else-#endif' side of
  this large conditional block.

yeah, indeed. The place you moved it to is the right spot, as it's under 
CONFIG_SMP. I've done this in my tree too.

   3. Add "#include <linux/vmalloc.h>" to kernel/sched.c

ok, did this in my tree too.

   4. Don't print cpu_khz in the cost matrix header, as cpu_khz
  is only in a few arch's (x86_64, ppc, i386, arm).

ok.

 Brought up 8 CPUs
 softlockup thread 7 started up.
 Total of 8 processors activated (15548.60 BogoMIPS).
 -
 migration cost matrix (max_cache_size: 33554432):
 -
   [00][01][02][03][04][05][06][07]
 [00]: -
 = end =
 
 Then it hung for 5 or 10 minutes, [...]

the default on ia64 (32MB) was way too large and caused the search to 
start from 64MB. That can take a _long_ time.

i've attached a new patch with your changes included, and a couple of 
new things added:

 - removed the 32MB max_cache_size hack from ia64 - it should now fall 
   back to the default 5MB and do a search from 10MB downwards. This
   should speed up the search.

 - added a migration_debug boot option - use it to get verbose printouts 
   about the search for the migration cost.

 - added a max_cache_size=<bytes> boot option for debugging.

 - a few cleanups

(in the next iteration of the patch i'll try a new method to further 
speed up the search - but didn't want to change it too much in this 
iteration.)

  [a001000db0d0] schedule_work+0x30/0x60
 sp=e1b03a8d7910 bsp=e1b03a8d14c8
  [a00100490230] blank_screen_t+0x30/0x60
 sp=e1b03a8d7910 bsp=e1b03a8d14b8
  [a001000c8130] run_timer_softirq+0x2d0/0x4a0
 sp=e1b03a8d7910 bsp=e1b03a8d1410

i think the crash is an unrelated bug: it seems the screen blanking 
timer hit and has crashed the box - i suspect it didn't expect the bootup 
to take that long.

Ingo
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -47,6 +47,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/vmalloc.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -4639,6 +4640,438 @@ void __devinit init_sched_build_groups(s
last->next = first;
 }
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads a big buffer to flush caches
+ * 2) the source CPU reads+dirties a shared buffer 
+ * 3) the target CPU reads+dirties the same shared buffer
+ * 4) the target CPU reads a big buffer to flush caches
+ *
+ * We measure how long steps #2 and #3 take (step #1 and #4 is not
+ * measured), in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a large buffer-size and iterate down to smaller
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * do a maximum search for the cost. The maximum cost for a migration
+ * occurs when the working set is just below the effective cache size.
+ */
+
+
+/*
+ * Flush the cache by reading a big buffer. (We want all writeback
+ * activities to subside. Works only if cache size is larger than
+ * 2*size, but that is good enough as the biggest migration effect
+ * is around cachesize size.)
+ */
+__init static void read_cache(void *__cache, unsigned long __size)
+{
+   unsigned long size = __size/sizeof(long);
+   unsigned long *cache = __cache;
+   volatile unsigned long data;
+   int i;
+
+   for (i = 0; i < 2*size; i += 4)
+   data = cache[i];
+}
+
+
+/*
+ * Dirty